2018-04-03 19:23:33 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2007-06-12 09:07:21 -04:00
|
|
|
/*
|
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
|
*/
|
|
|
|
|
|
2020-06-12 16:57:37 +10:00
|
|
|
#include <crypto/hash.h>
|
2008-04-25 16:53:30 -04:00
|
|
|
#include <linux/kernel.h>
|
2008-02-20 12:07:25 -05:00
|
|
|
#include <linux/bio.h>
|
2021-09-20 14:33:12 +02:00
|
|
|
#include <linux/blk-cgroup.h>
|
2008-05-02 14:43:14 -04:00
|
|
|
#include <linux/file.h>
|
2007-06-12 06:35:45 -04:00
|
|
|
#include <linux/fs.h>
|
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
|
#include <linux/time.h>
|
|
|
|
|
#include <linux/init.h>
|
|
|
|
|
#include <linux/string.h>
|
|
|
|
|
#include <linux/backing-dev.h>
|
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
|
#include <linux/compat.h>
|
2007-11-16 11:45:54 -05:00
|
|
|
#include <linux/xattr.h>
|
2008-07-24 12:16:36 -04:00
|
|
|
#include <linux/posix_acl.h>
|
2008-10-30 14:25:28 -04:00
|
|
|
#include <linux/falloc.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
|
|
|
#include <linux/slab.h>
|
2011-05-06 15:33:15 +02:00
|
|
|
#include <linux/ratelimit.h>
|
2013-01-29 06:04:50 +00:00
|
|
|
#include <linux/btrfs.h>
|
2013-01-29 18:40:14 -05:00
|
|
|
#include <linux/blkdev.h>
|
2013-06-19 10:16:26 -04:00
|
|
|
#include <linux/posix_acl_xattr.h>
|
2015-02-22 08:58:50 -08:00
|
|
|
#include <linux/uio.h>
|
2017-10-19 14:15:57 -04:00
|
|
|
#include <linux/magic.h>
|
2018-01-29 06:41:30 -05:00
|
|
|
#include <linux/iversion.h>
|
2016-11-03 10:28:14 -07:00
|
|
|
#include <linux/swap.h>
|
2020-03-04 16:57:35 -08:00
|
|
|
#include <linux/migrate.h>
|
2019-04-01 11:29:57 +03:00
|
|
|
#include <linux/sched/mm.h>
|
2020-08-17 11:18:21 -05:00
|
|
|
#include <linux/iomap.h>
|
2018-04-16 21:10:14 +02:00
|
|
|
#include <asm/unaligned.h>
|
2021-06-30 13:01:49 -07:00
|
|
|
#include <linux/fsverity.h>
|
2019-08-21 18:48:25 +02:00
|
|
|
#include "misc.h"
|
2007-06-12 06:35:45 -04:00
|
|
|
#include "ctree.h"
|
|
|
|
|
#include "disk-io.h"
|
|
|
|
|
#include "transaction.h"
|
|
|
|
|
#include "btrfs_inode.h"
|
|
|
|
|
#include "print-tree.h"
|
2008-07-17 12:53:50 -04:00
|
|
|
#include "ordered-data.h"
|
2008-08-28 06:21:17 -04:00
|
|
|
#include "xattr.h"
|
2008-09-05 16:13:11 -04:00
|
|
|
#include "tree-log.h"
|
2011-07-22 15:41:52 +02:00
|
|
|
#include "volumes.h"
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
#include "compression.h"
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able to get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 09:25:08 -05:00
|
|
|
#include "locking.h"
|
2011-01-28 17:05:48 -05:00
|
|
|
#include "free-space-cache.h"
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
#include "props.h"
|
2014-12-12 16:44:35 +08:00
|
|
|
#include "qgroup.h"
|
2019-06-19 15:12:00 -04:00
|
|
|
#include "delalloc-space.h"
|
2019-06-20 15:37:44 -04:00
|
|
|
#include "block-group.h"
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
#include "space-info.h"
|
2021-02-04 19:22:05 +09:00
|
|
|
#include "zoned.h"
|
2021-05-31 16:50:46 +08:00
|
|
|
#include "subpage.h"
|
2021-12-03 17:18:03 -05:00
|
|
|
#include "inode-item.h"
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
/*
 * Pairs an inode number with the root it belongs to.
 * NOTE(review): presumably the lookup key handed to the inode-get helpers;
 * the users of this struct are outside this view - confirm.
 */
struct btrfs_iget_args {
|
2020-05-15 19:35:59 +02:00
|
|
|
u64 ino;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *root;
|
|
|
|
|
};
|
|
|
|
|
|
2015-12-08 19:23:20 +00:00
|
|
|
struct btrfs_dio_data {
|
2020-08-17 11:18:21 -05:00
|
|
|
ssize_t submitted;
|
|
|
|
|
struct extent_changeset *data_reserved;
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
bool data_space_reserved;
|
|
|
|
|
bool nocow_done;
|
2015-12-08 19:23:20 +00:00
|
|
|
};
|
|
|
|
|
|
2022-05-05 15:11:14 -05:00
|
|
|
/*
 * Private state for direct I/O: the inode, the file offset and byte count
 * of the I/O, a reference count, a checksum array and an embedded bio.
 * The embedded bio has to stay the final member of the structure.
 */
struct btrfs_dio_private {
|
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Since DIO can use anonymous page, we cannot use page_offset() to
|
|
|
|
|
* grab the file offset, thus need a dedicated member for file offset.
|
|
|
|
|
*/
|
|
|
|
|
u64 file_offset;
|
|
|
|
|
/* Used for bio::bi_size */
|
|
|
|
|
u32 bytes;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* References to this structure. There is one reference per in-flight
|
|
|
|
|
* bio plus one while we're still setting up.
|
|
|
|
|
*/
|
|
|
|
|
refcount_t refs;
|
|
|
|
|
|
|
|
|
|
/* Array of checksums */
|
2022-05-05 15:11:15 -05:00
|
|
|
u8 *csums;
|
|
|
|
|
|
|
|
|
|
/* This must be last */
|
|
|
|
|
struct bio bio;
|
2022-05-05 15:11:14 -05:00
|
|
|
};
|
|
|
|
|
|
2022-05-05 15:11:15 -05:00
|
|
|
static struct bio_set btrfs_dio_bioset;
|
|
|
|
|
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
struct btrfs_rename_ctx {
|
|
|
|
|
/* Output field. Stores the index number of the old directory entry. */
|
|
|
|
|
u64 index;
|
|
|
|
|
};
|
|
|
|
|
|
2009-09-21 17:01:11 -07:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations;
|
|
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations;
|
|
|
|
|
static const struct inode_operations btrfs_special_inode_operations;
|
|
|
|
|
static const struct inode_operations btrfs_file_inode_operations;
|
2009-09-21 17:01:10 -07:00
|
|
|
static const struct address_space_operations btrfs_aops;
|
2009-10-01 15:43:56 -07:00
|
|
|
static const struct file_operations btrfs_dir_file_operations;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
static struct kmem_cache *btrfs_inode_cachep;
|
|
|
|
|
struct kmem_cache *btrfs_trans_handle_cachep;
|
|
|
|
|
struct kmem_cache *btrfs_path_cachep;
|
2011-01-28 17:05:48 -05:00
|
|
|
struct kmem_cache *btrfs_free_space_cachep;
|
btrfs: fix allocation of free space cache v1 bitmap pages
Various notifications of type "BUG kmalloc-4096 () : Redzone
overwritten" have been observed recently in various parts of the kernel.
After some time, it has been made a relation with the use of BTRFS
filesystem and with SLUB_DEBUG turned on.
[ 22.809700] BUG kmalloc-4096 (Tainted: G W ): Redzone overwritten
[ 22.810286] INFO: 0xbe1a5921-0xfbfc06cd. First byte 0x0 instead of 0xcc
[ 22.810866] INFO: Allocated in __load_free_space_cache+0x588/0x780 [btrfs] age=22 cpu=0 pid=224
[ 22.811193] __slab_alloc.constprop.26+0x44/0x70
[ 22.811345] kmem_cache_alloc_trace+0xf0/0x2ec
[ 22.811588] __load_free_space_cache+0x588/0x780 [btrfs]
[ 22.811848] load_free_space_cache+0xf4/0x1b0 [btrfs]
[ 22.812090] cache_block_group+0x1d0/0x3d0 [btrfs]
[ 22.812321] find_free_extent+0x680/0x12a4 [btrfs]
[ 22.812549] btrfs_reserve_extent+0xec/0x220 [btrfs]
[ 22.812785] btrfs_alloc_tree_block+0x178/0x5f4 [btrfs]
[ 22.813032] __btrfs_cow_block+0x150/0x5d4 [btrfs]
[ 22.813262] btrfs_cow_block+0x194/0x298 [btrfs]
[ 22.813484] commit_cowonly_roots+0x44/0x294 [btrfs]
[ 22.813718] btrfs_commit_transaction+0x63c/0xc0c [btrfs]
[ 22.813973] close_ctree+0xf8/0x2a4 [btrfs]
[ 22.814107] generic_shutdown_super+0x80/0x110
[ 22.814250] kill_anon_super+0x18/0x30
[ 22.814437] btrfs_kill_super+0x18/0x90 [btrfs]
[ 22.814590] INFO: Freed in proc_cgroup_show+0xc0/0x248 age=41 cpu=0 pid=83
[ 22.814841] proc_cgroup_show+0xc0/0x248
[ 22.814967] proc_single_show+0x54/0x98
[ 22.815086] seq_read+0x278/0x45c
[ 22.815190] __vfs_read+0x28/0x17c
[ 22.815289] vfs_read+0xa8/0x14c
[ 22.815381] ksys_read+0x50/0x94
[ 22.815475] ret_from_syscall+0x0/0x38
Commit 69d2480456d1 ("btrfs: use copy_page for copying pages instead of
memcpy") changed the way bitmap blocks are copied. But although bitmaps
have the size of a page, they were allocated with kzalloc().
Most of the time, kzalloc() allocates aligned blocks of memory, so
copy_page() can be used. But when some debug options like SLAB_DEBUG are
activated, kzalloc() may return unaligned pointer.
On powerpc, memcpy(), copy_page() and other copying functions use
'dcbz' instruction which provides an entire zeroed cacheline to avoid
memory read when the intention is to overwrite a full line. Functions
like memcpy() are written to care about partial cachelines at the start
and end of the destination, but copy_page() assumes it gets pages. As
pages are naturally cache aligned, copy_page() doesn't care about
partial lines. This means that when copy_page() is called with a
misaligned pointer, a few leading bytes are zeroed.
To fix it, allocate bitmaps through kmem_cache instead of using kzalloc()
The cache pool is created with PAGE_SIZE alignment constraint.
Reported-by: Erhard F. <erhard_f@mailbox.org>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204371
Fixes: 69d2480456d1 ("btrfs: use copy_page for copying pages instead of memcpy")
Cc: stable@vger.kernel.org # 4.19+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: David Sterba <dsterba@suse.com>
[ rename to btrfs_free_space_bitmap ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-21 15:05:55 +00:00
|
|
|
/*
 * Cache for free space cache v1 bitmap pages. Bitmaps are page sized and are
 * copied with copy_page(), which needs page-aligned buffers; kzalloc() does
 * not guarantee that alignment (e.g. with slab debugging enabled), so the
 * bitmaps come from a dedicated kmem_cache instead.
 */
struct kmem_cache *btrfs_free_space_bitmap_cachep;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2013-01-12 02:57:22 +00:00
|
|
|
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
|
2018-02-06 20:40:31 +00:00
|
|
|
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
|
2020-06-03 08:55:14 +03:00
|
|
|
static noinline int cow_file_range(struct btrfs_inode *inode,
|
2008-11-06 22:02:51 -05:00
|
|
|
struct page *locked_page,
|
2019-07-17 16:18:16 +03:00
|
|
|
u64 start, u64 end, int *page_started,
|
2022-07-09 08:18:49 +09:00
|
|
|
unsigned long *nr_written, int unlock,
|
|
|
|
|
u64 *done_offset);
|
2020-06-03 08:55:05 +03:00
|
|
|
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
|
|
|
|
|
u64 len, u64 orig_start, u64 block_start,
|
2017-01-31 07:50:22 -08:00
|
|
|
u64 block_len, u64 orig_block_len,
|
|
|
|
|
u64 ram_bytes, int compress_type,
|
|
|
|
|
int type);
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2020-09-24 11:39:16 -05:00
|
|
|
/*
|
|
|
|
|
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
|
|
|
|
|
*
|
|
|
|
|
* ilock_flags can have the following bit set:
|
|
|
|
|
*
|
|
|
|
|
* BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
|
|
|
|
|
* BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
|
|
|
|
|
* return -EAGAIN
|
2021-02-10 17:14:33 -05:00
|
|
|
* BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
|
2020-09-24 11:39:16 -05:00
|
|
|
*/
|
|
|
|
|
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
|
|
|
|
|
{
|
|
|
|
|
if (ilock_flags & BTRFS_ILOCK_SHARED) {
|
|
|
|
|
if (ilock_flags & BTRFS_ILOCK_TRY) {
|
|
|
|
|
if (!inode_trylock_shared(inode))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
inode_lock_shared(inode);
|
|
|
|
|
} else {
|
|
|
|
|
if (ilock_flags & BTRFS_ILOCK_TRY) {
|
|
|
|
|
if (!inode_trylock(inode))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
inode_lock(inode);
|
|
|
|
|
}
|
2021-02-10 17:14:33 -05:00
|
|
|
if (ilock_flags & BTRFS_ILOCK_MMAP)
|
|
|
|
|
down_write(&BTRFS_I(inode)->i_mmap_lock);
|
2020-09-24 11:39:16 -05:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* btrfs_inode_unlock - unock inode i_rwsem
|
|
|
|
|
*
|
|
|
|
|
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
|
|
|
|
|
* to decide whether the lock acquired is shared or exclusive.
|
|
|
|
|
*/
|
|
|
|
|
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
|
|
|
|
|
{
|
2021-02-10 17:14:33 -05:00
|
|
|
if (ilock_flags & BTRFS_ILOCK_MMAP)
|
|
|
|
|
up_write(&BTRFS_I(inode)->i_mmap_lock);
|
2020-09-24 11:39:16 -05:00
|
|
|
if (ilock_flags & BTRFS_ILOCK_SHARED)
|
|
|
|
|
inode_unlock_shared(inode);
|
|
|
|
|
else
|
|
|
|
|
inode_unlock(inode);
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 10:25:52 +08:00
|
|
|
/*
|
|
|
|
|
* Cleanup all submitted ordered extents in specified range to handle errors
|
2018-11-28 12:05:13 +01:00
|
|
|
* from the btrfs_run_delalloc_range() callback.
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 10:25:52 +08:00
|
|
|
*
|
|
|
|
|
* NOTE: caller must ensure that when an error happens, it can not call
|
|
|
|
|
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
|
|
|
|
|
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
|
|
|
|
|
* to be released, which we want to happen only when finishing the ordered
|
2018-11-21 17:10:52 +02:00
|
|
|
* extent (btrfs_finish_ordered_io()).
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 10:25:52 +08:00
|
|
|
*/
|
2020-06-03 08:55:25 +03:00
|
|
|
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
|
2018-11-21 17:10:52 +02:00
|
|
|
struct page *locked_page,
|
|
|
|
|
u64 offset, u64 bytes)
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 10:25:52 +08:00
|
|
|
{
|
2017-09-01 17:58:47 +09:00
|
|
|
unsigned long index = offset >> PAGE_SHIFT;
|
|
|
|
|
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
|
2022-06-21 15:41:00 +09:00
|
|
|
u64 page_start, page_end;
|
2017-09-01 17:58:47 +09:00
|
|
|
struct page *page;
|
|
|
|
|
|
2022-06-21 15:41:00 +09:00
|
|
|
if (locked_page) {
|
|
|
|
|
page_start = page_offset(locked_page);
|
|
|
|
|
page_end = page_start + PAGE_SIZE - 1;
|
|
|
|
|
}
|
|
|
|
|
|
2017-09-01 17:58:47 +09:00
|
|
|
while (index <= end_index) {
|
2021-05-18 15:09:41 +08:00
|
|
|
/*
|
|
|
|
|
* For locked page, we will call end_extent_writepage() on it
|
|
|
|
|
* in run_delalloc_range() for the error handling. That
|
|
|
|
|
* end_extent_writepage() function will call
|
|
|
|
|
* btrfs_mark_ordered_io_finished() to clear page Ordered and
|
|
|
|
|
* run the ordered extent accounting.
|
|
|
|
|
*
|
|
|
|
|
* Here we can't just clear the Ordered bit, or
|
|
|
|
|
* btrfs_mark_ordered_io_finished() would skip the accounting
|
|
|
|
|
* for the page range, and the ordered extent will never finish.
|
|
|
|
|
*/
|
2022-06-21 15:41:00 +09:00
|
|
|
if (locked_page && index == (page_start >> PAGE_SHIFT)) {
|
2021-05-18 15:09:41 +08:00
|
|
|
index++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2020-06-03 08:55:25 +03:00
|
|
|
page = find_get_page(inode->vfs_inode.i_mapping, index);
|
2017-09-01 17:58:47 +09:00
|
|
|
index++;
|
|
|
|
|
if (!page)
|
|
|
|
|
continue;
|
2021-05-18 15:09:41 +08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Here we just clear all Ordered bits for every page in the
|
2022-06-19 08:07:05 +02:00
|
|
|
* range, then btrfs_mark_ordered_io_finished() will handle
|
2021-05-18 15:09:41 +08:00
|
|
|
* the ordered extent accounting for the range.
|
|
|
|
|
*/
|
2021-05-31 16:50:46 +08:00
|
|
|
btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
|
|
|
|
|
offset, bytes);
|
2017-09-01 17:58:47 +09:00
|
|
|
put_page(page);
|
|
|
|
|
}
|
2018-11-21 17:10:52 +02:00
|
|
|
|
2022-06-21 15:41:00 +09:00
|
|
|
if (locked_page) {
|
|
|
|
|
/* The locked page covers the full range, nothing needs to be done */
|
|
|
|
|
if (bytes + offset <= page_start + PAGE_SIZE)
|
|
|
|
|
return;
|
|
|
|
|
/*
|
|
|
|
|
* In case this page belongs to the delalloc range being
|
|
|
|
|
* instantiated then skip it, since the first page of a range is
|
|
|
|
|
* going to be properly cleaned up by the caller of
|
|
|
|
|
* run_delalloc_range
|
|
|
|
|
*/
|
|
|
|
|
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
|
|
|
|
|
bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
|
|
|
|
|
offset = page_offset(locked_page) + PAGE_SIZE;
|
|
|
|
|
}
|
2018-11-21 17:10:52 +02:00
|
|
|
}
|
|
|
|
|
|
2022-06-19 08:07:05 +02:00
|
|
|
return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 10:25:52 +08:00
|
|
|
}
|
|
|
|
|
|
2013-04-25 20:41:01 +00:00
|
|
|
/* Prototype of a file-local (static) helper that is used before its definition. */
static int btrfs_dirty_inode(struct inode *inode);
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2009-11-12 09:35:27 +00:00
|
|
|
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
|
2022-03-14 18:12:34 -07:00
|
|
|
struct btrfs_new_inode_args *args)
|
2009-02-04 09:29:13 -05:00
|
|
|
{
|
|
|
|
|
int err;
|
|
|
|
|
|
2022-03-14 18:12:34 -07:00
|
|
|
if (args->default_acl) {
|
|
|
|
|
err = __btrfs_set_acl(trans, args->inode, args->default_acl,
|
|
|
|
|
ACL_TYPE_DEFAULT);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
if (args->acl) {
|
|
|
|
|
err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
|
|
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
if (!args->default_acl && !args->acl)
|
|
|
|
|
cache_no_acl(args->inode);
|
|
|
|
|
return btrfs_xattr_security_init(trans, args->inode, args->dir,
|
|
|
|
|
&args->dentry->d_name);
|
2009-02-04 09:29:13 -05:00
|
|
|
}
|
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
/*
|
|
|
|
|
* this does all the hard work for inserting an inline extent into
|
|
|
|
|
* the btree. The caller should have done a btrfs_drop_extents so that
|
|
|
|
|
* no overlapping inline items exist in the btree
|
|
|
|
|
*/
|
2014-05-21 13:35:51 -07:00
|
|
|
static int insert_inline_extent(struct btrfs_trans_handle *trans,
|
2021-11-16 14:03:45 -08:00
|
|
|
struct btrfs_path *path,
|
|
|
|
|
struct btrfs_inode *inode, bool extent_inserted,
|
|
|
|
|
size_t size, size_t compressed_size,
|
2011-03-28 08:30:38 +00:00
|
|
|
int compress_type,
|
2019-11-07 15:19:16 -08:00
|
|
|
struct page **compressed_pages,
|
|
|
|
|
bool update_i_size)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
{
|
2021-11-16 14:03:45 -08:00
|
|
|
struct btrfs_root *root = inode->root;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct page *page = NULL;
|
|
|
|
|
char *kaddr;
|
|
|
|
|
unsigned long ptr;
|
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
|
int ret;
|
|
|
|
|
size_t cur_size = size;
|
2019-11-07 15:19:16 -08:00
|
|
|
u64 i_size;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2019-07-27 16:51:13 +08:00
|
|
|
ASSERT((compressed_size > 0 && compressed_pages) ||
|
|
|
|
|
(compressed_size == 0 && !compressed_pages));
|
|
|
|
|
|
2011-03-28 08:30:38 +00:00
|
|
|
if (compressed_size && compressed_pages)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
cur_size = compressed_size;
|
|
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
if (!extent_inserted) {
|
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
size_t datasize;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2021-11-16 14:03:45 -08:00
|
|
|
key.objectid = btrfs_ino(inode);
|
|
|
|
|
key.offset = 0;
|
2014-06-04 18:41:45 +02:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
datasize = btrfs_file_extent_calc_inline_size(cur_size);
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
|
datasize);
|
2017-06-15 19:09:51 +02:00
|
|
|
if (ret)
|
2014-01-07 11:42:27 +00:00
|
|
|
goto fail;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
|
btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
|
|
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
|
|
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
|
|
|
|
|
2010-12-17 14:21:50 +08:00
|
|
|
if (compress_type != BTRFS_COMPRESS_NONE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
struct page *cpage;
|
|
|
|
|
int i = 0;
|
2009-01-05 21:25:51 -05:00
|
|
|
while (compressed_size > 0) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
cpage = compressed_pages[i];
|
2008-11-11 09:34:41 -05:00
|
|
|
cur_size = min_t(unsigned long, compressed_size,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
PAGE_SIZE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2022-06-27 19:48:49 +02:00
|
|
|
kaddr = kmap_local_page(cpage);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
write_extent_buffer(leaf, kaddr, ptr, cur_size);
|
2022-06-27 19:48:49 +02:00
|
|
|
kunmap_local(kaddr);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
|
|
|
|
i++;
|
|
|
|
|
ptr += cur_size;
|
|
|
|
|
compressed_size -= cur_size;
|
|
|
|
|
}
|
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei,
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
} else {
|
2021-11-16 14:03:45 -08:00
|
|
|
page = find_get_page(inode->vfs_inode.i_mapping, 0);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
2022-06-27 19:48:49 +02:00
|
|
|
kaddr = kmap_local_page(page);
|
2021-11-16 14:03:45 -08:00
|
|
|
write_extent_buffer(leaf, kaddr, ptr, size);
|
2022-06-27 19:48:49 +02:00
|
|
|
kunmap_local(kaddr);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2014-01-07 11:42:27 +00:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2020-01-17 09:02:22 -05:00
|
|
|
/*
|
|
|
|
|
* We align size to sectorsize for inline extents just for simplicity
|
|
|
|
|
* sake.
|
|
|
|
|
*/
|
2021-11-16 14:03:45 -08:00
|
|
|
ret = btrfs_inode_set_file_extent_range(inode, 0,
|
|
|
|
|
ALIGN(size, root->fs_info->sectorsize));
|
2020-01-17 09:02:22 -05:00
|
|
|
if (ret)
|
|
|
|
|
goto fail;
|
|
|
|
|
|
2009-11-12 09:34:21 +00:00
|
|
|
/*
|
2019-11-07 15:19:16 -08:00
|
|
|
* We're an inline extent, so nobody can extend the file past i_size
|
|
|
|
|
* without locking a page we already have locked.
|
2009-11-12 09:34:21 +00:00
|
|
|
*
|
2019-11-07 15:19:16 -08:00
|
|
|
* We must do any i_size and inode updates before we unlock the pages.
|
|
|
|
|
* Otherwise we could end up racing with unlink.
|
2009-11-12 09:34:21 +00:00
|
|
|
*/
|
2019-11-07 15:19:16 -08:00
|
|
|
i_size = i_size_read(&inode->vfs_inode);
|
|
|
|
|
if (update_i_size && size > i_size) {
|
|
|
|
|
i_size_write(&inode->vfs_inode, size);
|
|
|
|
|
i_size = size;
|
|
|
|
|
}
|
|
|
|
|
inode->disk_i_size = i_size;
|
2021-11-16 14:03:45 -08:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
fail:
|
2017-06-15 19:09:51 +02:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* conditionally insert an inline extent into the file. This
|
|
|
|
|
* does the checks required to make sure the data is small enough
|
|
|
|
|
* to fit as an inline extent.
|
|
|
|
|
*/
|
2021-11-16 14:03:45 -08:00
|
|
|
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
|
|
|
|
|
size_t compressed_size,
|
2013-08-14 14:02:47 -04:00
|
|
|
int compress_type,
|
2019-11-07 15:19:16 -08:00
|
|
|
struct page **compressed_pages,
|
|
|
|
|
bool update_i_size)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
{
|
2020-11-04 11:07:32 +00:00
|
|
|
struct btrfs_drop_extents_args drop_args = { 0 };
|
2020-06-03 08:55:12 +03:00
|
|
|
struct btrfs_root *root = inode->root;
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2013-08-14 14:02:47 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
2021-11-16 14:03:45 -08:00
|
|
|
u64 data_len = (compressed_size ?: size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
int ret;
|
2014-01-07 11:42:27 +00:00
|
|
|
struct btrfs_path *path;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2021-11-16 14:03:45 -08:00
|
|
|
/*
|
|
|
|
|
* We can create an inline extent if it ends at or beyond the current
|
|
|
|
|
* i_size, is no larger than a sector (decompressed), and the (possibly
|
|
|
|
|
* compressed) data fits in a leaf and the configured maximum inline
|
|
|
|
|
* size.
|
|
|
|
|
*/
|
|
|
|
|
if (size < i_size_read(&inode->vfs_inode) ||
|
|
|
|
|
size > fs_info->sectorsize ||
|
2016-06-22 18:54:23 -04:00
|
|
|
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
|
2021-11-16 14:03:45 -08:00
|
|
|
data_len > fs_info->max_inline)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
return 1;
|
|
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
2013-08-14 14:02:47 -04:00
|
|
|
trans = btrfs_join_transaction(root);
|
2014-01-07 11:42:27 +00:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
btrfs_free_path(path);
|
2013-08-14 14:02:47 -04:00
|
|
|
return PTR_ERR(trans);
|
2014-01-07 11:42:27 +00:00
|
|
|
}
|
2020-06-03 08:55:12 +03:00
|
|
|
trans->block_rsv = &inode->block_rsv;
|
2013-08-14 14:02:47 -04:00
|
|
|
|
2020-11-04 11:07:32 +00:00
|
|
|
drop_args.path = path;
|
2021-11-16 14:03:45 -08:00
|
|
|
drop_args.start = 0;
|
|
|
|
|
drop_args.end = fs_info->sectorsize;
|
2020-11-04 11:07:32 +00:00
|
|
|
drop_args.drop_cache = true;
|
|
|
|
|
drop_args.replace_extent = true;
|
2021-11-16 14:03:45 -08:00
|
|
|
drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
|
2020-11-04 11:07:32 +00:00
|
|
|
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
|
2013-08-14 14:02:47 -04:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-08-14 14:02:47 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2021-11-16 14:03:45 -08:00
|
|
|
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
|
|
|
|
|
size, compressed_size, compress_type,
|
2019-11-07 15:19:16 -08:00
|
|
|
compressed_pages, update_i_size);
|
2012-05-23 16:10:14 -04:00
|
|
|
if (ret && ret != -ENOSPC) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-08-14 14:02:47 -04:00
|
|
|
goto out;
|
2012-05-23 16:10:14 -04:00
|
|
|
} else if (ret == -ENOSPC) {
|
2013-08-14 14:02:47 -04:00
|
|
|
ret = 1;
|
|
|
|
|
goto out;
|
2012-03-12 16:03:00 +01:00
|
|
|
}
|
2012-05-23 16:10:14 -04:00
|
|
|
|
2021-11-16 14:03:45 -08:00
|
|
|
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
if (ret && ret != -ENOSPC) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out;
|
|
|
|
|
} else if (ret == -ENOSPC) {
|
|
|
|
|
ret = 1;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: reset last_reflink_trans after fsyncing inode
When an inode has a last_reflink_trans matching the current transaction,
we have to take special care when logging its checksums in order to
avoid getting checksum items with overlapping ranges in a log tree,
which could result in missing checksums after log replay (more on that
in the changelogs of commit 40e046acbd2f36 ("Btrfs: fix missing data
checksums after replaying a log tree") and commit e289f03ea79bbc ("btrfs:
fix corrupt log due to concurrent fsync of inodes with shared extents")).
We also need to make sure a full fsync will copy all old file extent
items it finds in modified leaves, because they might have been copied
from some other inode.
However once we fsync an inode, we don't need to keep paying the price of
that extra special care in future fsyncs done in the same transaction,
unless the inode is used for another reflink operation or the full sync
flag is set on it (truncate, failure to allocate extent maps for holes,
and other exceptional and infrequent cases).
So after we fsync an inode, reset its last_reflink_trans to zero. In case
another reflink happens, we continue to update the last_reflink_trans of
the inode, just as before. Also set last_reflink_trans to the generation
of the last transaction that modified the inode whenever we need to set
the full sync flag on the inode, just like when we need to load an inode
from disk after eviction.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-02-17 12:12:06 +00:00
|
|
|
btrfs_set_inode_full_sync(inode);
|
2013-08-14 14:02:47 -04:00
|
|
|
out:
|
2015-09-08 17:25:56 +08:00
|
|
|
/*
|
|
|
|
|
* Don't forget to free the reserved space, as for inlined extent
|
|
|
|
|
* it won't count as data extent, free them directly here.
|
|
|
|
|
* And at reserve time, it's always aligned to page size, so
|
|
|
|
|
* just free one page here.
|
|
|
|
|
*/
|
2020-06-03 08:55:12 +03:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
|
2014-01-07 11:42:27 +00:00
|
|
|
btrfs_free_path(path);
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2013-08-14 14:02:47 -04:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
struct async_extent {
|
|
|
|
|
u64 start;
|
|
|
|
|
u64 ram_size;
|
|
|
|
|
u64 compressed_size;
|
|
|
|
|
struct page **pages;
|
|
|
|
|
unsigned long nr_pages;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type;
|
2008-11-06 22:02:51 -05:00
|
|
|
struct list_head list;
|
|
|
|
|
};
|
|
|
|
|
|
2019-03-12 17:20:24 +02:00
|
|
|
struct async_chunk {
|
2008-11-06 22:02:51 -05:00
|
|
|
struct inode *inode;
|
|
|
|
|
struct page *locked_page;
|
|
|
|
|
u64 start;
|
|
|
|
|
u64 end;
|
2022-07-14 11:07:16 -07:00
|
|
|
blk_opf_t write_flags;
|
2008-11-06 22:02:51 -05:00
|
|
|
struct list_head extents;
|
2019-07-10 12:28:17 -07:00
|
|
|
struct cgroup_subsys_state *blkcg_css;
|
2008-11-06 22:02:51 -05:00
|
|
|
struct btrfs_work work;
|
2021-09-27 15:21:45 +08:00
|
|
|
struct async_cow *async_cow;
|
2008-11-06 22:02:51 -05:00
|
|
|
};
|
|
|
|
|
|
2019-03-12 17:20:24 +02:00
|
|
|
struct async_cow {
|
|
|
|
|
atomic_t num_chunks;
|
|
|
|
|
struct async_chunk chunks[];
|
2008-11-06 22:02:51 -05:00
|
|
|
};
|
|
|
|
|
|
2019-03-12 17:20:24 +02:00
|
|
|
/*
 * Allocate a new async_extent, record the given values in it and append it
 * to the tail of @cow->extents.
 *
 * @cow:             chunk whose ->extents list receives the new entry
 * @start:           stored in async_extent::start
 * @ram_size:        stored in async_extent::ram_size
 * @compressed_size: stored in async_extent::compressed_size
 * @pages:           page array pointer, stored as is (not copied)
 * @nr_pages:        stored in async_extent::nr_pages
 * @compress_type:   stored in async_extent::compress_type
 *
 * The entry is allocated with GFP_NOFS; an allocation failure is treated as
 * fatal via BUG_ON() rather than reported to the caller.
 *
 * Return: always 0.
 */
static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *entry = kmalloc(sizeof(*entry), GFP_NOFS);

	BUG_ON(!entry); /* -ENOMEM */

	/* Fill in every payload field before the entry becomes reachable. */
	entry->compress_type = compress_type;
	entry->nr_pages = nr_pages;
	entry->pages = pages;
	entry->compressed_size = compressed_size;
	entry->ram_size = ram_size;
	entry->start = start;

	list_add_tail(&entry->list, &cow->extents);

	return 0;
}
|
|
|
|
|
|
btrfs: inode: Don't compress if NODATASUM or NODATACOW set
As btrfs(5) specified:
Note
If nodatacow or nodatasum are enabled, compression is disabled.
If NODATASUM or NODATACOW set, we should not compress the extent.
Normally NODATACOW is detected properly in run_delalloc_range() so
compression won't happen for NODATACOW.
However for NODATASUM we don't have any check, and it can cause
compressed extent without csum pretty easily, just by:
mkfs.btrfs -f $dev
mount $dev $mnt -o nodatasum
touch $mnt/foobar
mount -o remount,datasum,compress $mnt
xfs_io -f -c "pwrite 0 128K" $mnt/foobar
And in fact, we have a bug report about corrupted compressed extent
without proper data checksum so even RAID1 can't recover the corruption.
(https://bugzilla.kernel.org/show_bug.cgi?id=199707)
Running compression without proper checksum could cause more damage when
corruption happens, as compressed data could make the whole extent
unreadable, so there is no need to allow compression for
NODATASUM.
The fix will refactor the inode compression check into two parts:
- inode_can_compress()
As the hard requirement, checked at btrfs_run_delalloc_range(), so no
compression will happen for NODATASUM inode at all.
- inode_need_compress()
As the soft requirement, checked at btrfs_run_delalloc_range() and
compress_file_range().
Reported-by: James Harvey <jamespharvey20@gmail.com>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-01 05:12:46 +00:00
|
|
|
/*
|
|
|
|
|
* Check if the inode needs to be submitted to compression, based on mount
|
|
|
|
|
* options, defragmentation, properties or heuristics.
|
|
|
|
|
*/
|
2020-06-03 08:55:27 +03:00
|
|
|
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
|
|
|
|
|
u64 end)
|
2014-07-17 11:44:09 +08:00
|
|
|
{
|
2020-06-03 08:55:27 +03:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
2014-07-17 11:44:09 +08:00
|
|
|
|
2022-04-15 16:04:05 +08:00
|
|
|
if (!btrfs_inode_can_compress(inode)) {
|
btrfs: inode: Don't compress if NODATASUM or NODATACOW set
As btrfs(5) specified:
Note
If nodatacow or nodatasum are enabled, compression is disabled.
If NODATASUM or NODATACOW set, we should not compress the extent.
Normally NODATACOW is detected properly in run_delalloc_range() so
compression won't happen for NODATACOW.
However for NODATASUM we don't have any check, and it can cause
compressed extent without csum pretty easily, just by:
mkfs.btrfs -f $dev
mount $dev $mnt -o nodatasum
touch $mnt/foobar
mount -o remount,datasum,compress $mnt
xfs_io -f -c "pwrite 0 128K" $mnt/foobar
And in fact, we have a bug report about corrupted compressed extent
without proper data checksum so even RAID1 can't recover the corruption.
(https://bugzilla.kernel.org/show_bug.cgi?id=199707)
Running compression without proper checksum could cause more damage when
corruption happens, as compressed data could make the whole extent
unreadable, so there is no need to allow compression for
NODATACSUM.
The fix will refactor the inode compression check into two parts:
- inode_can_compress()
As the hard requirement, checked at btrfs_run_delalloc_range(), so no
compression will happen for NODATASUM inode at all.
- inode_need_compress()
As the soft requirement, checked at btrfs_run_delalloc_range() and
compress_file_range().
Reported-by: James Harvey <jamespharvey20@gmail.com>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-01 05:12:46 +00:00
|
|
|
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
|
|
|
|
|
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
|
2020-06-03 08:55:27 +03:00
|
|
|
btrfs_ino(inode));
|
btrfs: inode: Don't compress if NODATASUM or NODATACOW set
As btrfs(5) specified:
Note
If nodatacow or nodatasum are enabled, compression is disabled.
If NODATASUM or NODATACOW set, we should not compress the extent.
Normally NODATACOW is detected properly in run_delalloc_range() so
compression won't happen for NODATACOW.
However for NODATASUM we don't have any check, and it can cause
compressed extent without csum pretty easily, just by:
mkfs.btrfs -f $dev
mount $dev $mnt -o nodatasum
touch $mnt/foobar
mount -o remount,datasum,compress $mnt
xfs_io -f -c "pwrite 0 128K" $mnt/foobar
And in fact, we have a bug report about corrupted compressed extent
without proper data checksum so even RAID1 can't recover the corruption.
(https://bugzilla.kernel.org/show_bug.cgi?id=199707)
Running compression without proper checksum could cause more damage when
corruption happens, as compressed data could make the whole extent
unreadable, so there is no need to allow compression for
NODATACSUM.
The fix will refactor the inode compression check into two parts:
- inode_can_compress()
As the hard requirement, checked at btrfs_run_delalloc_range(), so no
compression will happen for NODATASUM inode at all.
- inode_need_compress()
As the soft requirement, checked at btrfs_run_delalloc_range() and
compress_file_range().
Reported-by: James Harvey <jamespharvey20@gmail.com>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-01 05:12:46 +00:00
|
|
|
return 0;
|
|
|
|
|
}
|
btrfs: subpage: only allow compression if the range is fully page aligned
For compressed write, we use a mechanism called async COW, which unlike
regular run_delalloc_cow() or cow_file_range() will also unlock the
first page.
This mechanism allows us to continue handling next ranges, without
waiting for the time consuming compression.
But this has a problem for subpage case, as we could have the following
delalloc range for a page:
0 32K 64K
| |///////| |///////|
\- A \- B
In the above case, if we pass both ranges to cow_file_range_async(),
both range A and range B will try to unlock the full page [0, 64K).
And which one finishes later than the other one will try to do other
page operations like end_page_writeback() on a unlocked page, triggering
VM layer BUG_ON().
To make subpage compression work at least partially, here we add another
restriction for it, only allow compression if the delalloc range is
fully page aligned.
By that, async extent is always ensured to unlock the first page
exclusively, just like it used to be for regular sectorsize.
In theory, we only need to make sure the delalloc range fully covers its
first page, but the tail page will be locked anyway, blocking later
writeback until the compression finishes.
Thus here we choose to make sure the range is fully page aligned before
doing the compression.
In the future, we could optimize the situation by properly increasing
subpage::writers number for the locked page, but that also means we need
to change how we run delalloc range of page.
(Instead of running each delalloc range we hit, we need to find and lock
all delalloc ranges covering the page, then run each of them).
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-27 15:22:08 +08:00
|
|
|
/*
|
|
|
|
|
* Special check for subpage.
|
|
|
|
|
*
|
|
|
|
|
* We lock the full page then run each delalloc range in the page, thus
|
|
|
|
|
* for the following case, we will hit some subpage specific corner case:
|
|
|
|
|
*
|
|
|
|
|
* 0 32K 64K
|
|
|
|
|
* | |///////| |///////|
|
|
|
|
|
* \- A \- B
|
|
|
|
|
*
|
|
|
|
|
* In above case, both range A and range B will try to unlock the full
|
|
|
|
|
* page [0, 64K), causing the one finished later will have page
|
|
|
|
|
* unlocked already, triggering various page lock requirement BUG_ON()s.
|
|
|
|
|
*
|
|
|
|
|
* So here we add an artificial limit that subpage compression can only
|
|
|
|
|
* if the range is fully page aligned.
|
|
|
|
|
*
|
|
|
|
|
* In theory we only need to ensure the first page is fully covered, but
|
|
|
|
|
* the tailing partial page will be locked until the full compression
|
|
|
|
|
* finishes, delaying the write of other range.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
|
|
|
|
|
* first to prevent any submitted async extent to unlock the full page.
|
|
|
|
|
* By this, we can ensure for subpage case that only the last async_cow
|
|
|
|
|
* will unlock the full page.
|
|
|
|
|
*/
|
|
|
|
|
if (fs_info->sectorsize < PAGE_SIZE) {
|
2022-05-26 22:35:40 +08:00
|
|
|
if (!PAGE_ALIGNED(start) ||
|
|
|
|
|
!PAGE_ALIGNED(end + 1))
|
btrfs: subpage: only allow compression if the range is fully page aligned
For compressed write, we use a mechanism called async COW, which unlike
regular run_delalloc_cow() or cow_file_range() will also unlock the
first page.
This mechanism allows us to continue handling next ranges, without
waiting for the time consuming compression.
But this has a problem for subpage case, as we could have the following
delalloc range for a page:
0 32K 64K
| |///////| |///////|
\- A \- B
In the above case, if we pass both ranges to cow_file_range_async(),
both range A and range B will try to unlock the full page [0, 64K).
And which one finishes later than the other one will try to do other
page operations like end_page_writeback() on a unlocked page, triggering
VM layer BUG_ON().
To make subpage compression work at least partially, here we add another
restriction for it, only allow compression if the delalloc range is
fully page aligned.
By that, async extent is always ensured to unlock the first page
exclusively, just like it used to be for regular sectorsize.
In theory, we only need to make sure the delalloc range fully covers its
first page, but the tail page will be locked anyway, blocking later
writeback until the compression finishes.
Thus here we choose to make sure the range is fully page aligned before
doing the compression.
In the future, we could optimize the situation by properly increasing
subpage::writers number for the locked page, but that also means we need
to change how we run delalloc range of page.
(Instead of running each delalloc range we hit, we need to find and lock
all delalloc ranges covering the page, then run each of them).
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-27 15:22:08 +08:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2014-07-17 11:44:09 +08:00
|
|
|
/* force compress */
|
2016-06-22 18:54:23 -04:00
|
|
|
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
|
2014-07-17 11:44:09 +08:00
|
|
|
return 1;
|
2017-07-17 19:41:31 +02:00
|
|
|
/* defrag ioctl */
|
2020-06-03 08:55:27 +03:00
|
|
|
if (inode->defrag_compress)
|
2017-07-17 19:41:31 +02:00
|
|
|
return 1;
|
2014-07-17 11:44:09 +08:00
|
|
|
/* bad compression ratios */
|
2020-06-03 08:55:27 +03:00
|
|
|
if (inode->flags & BTRFS_INODE_NOCOMPRESS)
|
2014-07-17 11:44:09 +08:00
|
|
|
return 0;
|
2016-06-22 18:54:23 -04:00
|
|
|
if (btrfs_test_opt(fs_info, COMPRESS) ||
|
2020-06-03 08:55:27 +03:00
|
|
|
inode->flags & BTRFS_INODE_COMPRESS ||
|
|
|
|
|
inode->prop_compress)
|
|
|
|
|
return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
|
2014-07-17 11:44:09 +08:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-20 13:50:43 +02:00
|
|
|
/*
 * Kick off a defrag of @inode when the write described by the arguments is a
 * small write inside EOF.
 *
 * @inode:       inode being written
 * @start:       offset of the first byte written
 * @end:         offset of the last byte written (inclusive)
 * @num_bytes:   length of the write
 * @small_write: writes shorter than this many bytes count as small; the
 *               value is also handed on to btrfs_add_inode_defrag()
 */
static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	/* Not a small write, nothing to do */
	if (num_bytes >= small_write)
		return;

	/* Write starts at offset 0 and reaches disk_i_size: not inside EOF */
	if (start == 0 && end + 1 >= inode->disk_i_size)
		return;

	btrfs_add_inode_defrag(NULL, inode, small_write);
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
2008-11-06 22:02:51 -05:00
|
|
|
* we create compressed extents in two phases. The first
|
|
|
|
|
* phase compresses a range of pages that have already been
|
|
|
|
|
* locked (both pages and state bits are locked).
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
*
|
2008-11-06 22:02:51 -05:00
|
|
|
* This is done inside an ordered work queue, and the compression
|
|
|
|
|
* is spread across many cpus. The actual IO submission is step
|
|
|
|
|
* two, and the ordered work queue takes care of making sure that
|
|
|
|
|
* happens in the same order things were put onto the queue by
|
|
|
|
|
* writepages and friends.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
*
|
2008-11-06 22:02:51 -05:00
|
|
|
* If this code finds it can't get good compression, it puts an
|
|
|
|
|
* entry onto the work queue to write the uncompressed bytes. This
|
|
|
|
|
* makes sure that both compressed inodes and uncompressed inodes
|
2012-07-25 18:12:06 +03:00
|
|
|
* are written in the same order that the flusher thread sent them
|
|
|
|
|
* down.
|
2008-09-29 15:18:18 -04:00
|
|
|
*/
|
2019-07-17 14:41:44 +03:00
|
|
|
static noinline int compress_file_range(struct async_chunk *async_chunk)
|
2007-08-27 16:49:44 -04:00
|
|
|
{
|
2019-03-12 17:20:27 +02:00
|
|
|
struct inode *inode = async_chunk->inode;
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
u64 blocksize = fs_info->sectorsize;
|
2019-03-12 17:20:27 +02:00
|
|
|
u64 start = async_chunk->start;
|
|
|
|
|
u64 end = async_chunk->end;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
u64 actual_end;
|
btrfs: save i_size to avoid double evaluation of i_size_read in compress_file_range
We hit a regression while rolling out 5.2 internally where we were
hitting the following panic
kernel BUG at mm/page-writeback.c:2659!
RIP: 0010:clear_page_dirty_for_io+0xe6/0x1f0
Call Trace:
__process_pages_contig+0x25a/0x350
? extent_clear_unlock_delalloc+0x43/0x70
submit_compressed_extents+0x359/0x4d0
normal_work_helper+0x15a/0x330
process_one_work+0x1f5/0x3f0
worker_thread+0x2d/0x3d0
? rescuer_thread+0x340/0x340
kthread+0x111/0x130
? kthread_create_on_node+0x60/0x60
ret_from_fork+0x1f/0x30
This is happening because the page is not locked when doing
clear_page_dirty_for_io. Looking at the core dump it was because our
async_extent had a ram_size of 24576 but our async_chunk range only
spanned 20480, so we had a whole extra page in our ram_size for our
async_extent.
This happened because we try not to compress pages outside of our
i_size, however a cleanup patch changed us to do
actual_end = min_t(u64, i_size_read(inode), end + 1);
which is problematic because i_size_read() can evaluate to different
values in between checking and assigning. So either an expanding
truncate or a fallocate could increase our i_size while we're doing
writeout and actual_end would end up being past the range we have
locked.
I confirmed this was what was happening by installing a debug kernel
that had
actual_end = min_t(u64, i_size_read(inode), end + 1);
if (actual_end > end + 1) {
printk(KERN_ERR "KABOOM\n");
actual_end = end + 1;
}
and installing it onto 500 boxes of the tier that had been seeing the
problem regularly. Last night I got my debug message and no panic,
confirming what I expected.
[ dsterba: the assembly confirms a tiny race window:
mov 0x20(%rsp),%rax
cmp %rax,0x48(%r15) # read
movl $0x0,0x18(%rsp)
mov %rax,%r12
mov %r14,%rax
cmovbe 0x48(%r15),%r12 # eval
Where r15 is inode and 0x48 is offset of i_size.
The original fix was to revert 62b37622718c that would do an
intermediate assignment and this would also avoid the double
evaluation but is not future-proof, should the compiler merge the
stores and call i_size_read anyway.
There's a patch adding READ_ONCE to i_size_read but that's not being
applied at the moment and we need to fix the bug. Instead, emulate
READ_ONCE by two barrier()s that's what effectively happens. The
assembly confirms single evaluation:
mov 0x48(%rbp),%rax # read once
mov 0x20(%rsp),%rcx
mov $0x20,%edx
cmp %rax,%rcx
cmovbe %rcx,%rax
mov %rax,(%rsp)
mov %rax,%rcx
mov %r14,%rax
Where 0x48(%rbp) is inode->i_size stored to %eax.
]
Fixes: 62b37622718c ("btrfs: Remove isize local variable in compress_file_range")
CC: stable@vger.kernel.org # v5.1+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ changelog updated ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-11 09:03:54 -04:00
|
|
|
u64 i_size;
|
2008-07-17 12:53:50 -04:00
|
|
|
int ret = 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
struct page **pages = NULL;
|
|
|
|
|
unsigned long nr_pages;
|
|
|
|
|
unsigned long total_compressed = 0;
|
|
|
|
|
unsigned long total_in = 0;
|
|
|
|
|
int i;
|
|
|
|
|
int will_compress;
|
2016-06-22 18:54:23 -04:00
|
|
|
int compress_type = fs_info->compress_type;
|
2019-07-17 14:41:44 +03:00
|
|
|
int compressed_extents = 0;
|
2013-03-26 13:07:00 -04:00
|
|
|
int redirty = 0;
|
2007-08-27 16:49:44 -04:00
|
|
|
|
2017-02-20 13:50:43 +02:00
|
|
|
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
|
|
|
|
|
SZ_16K);
|
2011-05-24 15:35:30 -04:00
|
|
|
|
btrfs: save i_size to avoid double evaluation of i_size_read in compress_file_range
We hit a regression while rolling out 5.2 internally where we were
hitting the following panic
kernel BUG at mm/page-writeback.c:2659!
RIP: 0010:clear_page_dirty_for_io+0xe6/0x1f0
Call Trace:
__process_pages_contig+0x25a/0x350
? extent_clear_unlock_delalloc+0x43/0x70
submit_compressed_extents+0x359/0x4d0
normal_work_helper+0x15a/0x330
process_one_work+0x1f5/0x3f0
worker_thread+0x2d/0x3d0
? rescuer_thread+0x340/0x340
kthread+0x111/0x130
? kthread_create_on_node+0x60/0x60
ret_from_fork+0x1f/0x30
This is happening because the page is not locked when doing
clear_page_dirty_for_io. Looking at the core dump it was because our
async_extent had a ram_size of 24576 but our async_chunk range only
spanned 20480, so we had a whole extra page in our ram_size for our
async_extent.
This happened because we try not to compress pages outside of our
i_size, however a cleanup patch changed us to do
actual_end = min_t(u64, i_size_read(inode), end + 1);
which is problematic because i_size_read() can evaluate to different
values in between checking and assigning. So either an expanding
truncate or a fallocate could increase our i_size while we're doing
writeout and actual_end would end up being past the range we have
locked.
I confirmed this was what was happening by installing a debug kernel
that had
actual_end = min_t(u64, i_size_read(inode), end + 1);
if (actual_end > end + 1) {
printk(KERN_ERR "KABOOM\n");
actual_end = end + 1;
}
and installing it onto 500 boxes of the tier that had been seeing the
problem regularly. Last night I got my debug message and no panic,
confirming what I expected.
[ dsterba: the assembly confirms a tiny race window:
mov 0x20(%rsp),%rax
cmp %rax,0x48(%r15) # read
movl $0x0,0x18(%rsp)
mov %rax,%r12
mov %r14,%rax
cmovbe 0x48(%r15),%r12 # eval
Where r15 is inode and 0x48 is offset of i_size.
The original fix was to revert 62b37622718c that would do an
intermediate assignment and this would also avoid the double
evaluation but is not future-proof, should the compiler merge the
stores and call i_size_read anyway.
There's a patch adding READ_ONCE to i_size_read but that's not being
applied at the moment and we need to fix the bug. Instead, emulate
READ_ONCE by two barrier()s that's what effectively happens. The
assembly confirms single evaluation:
mov 0x48(%rbp),%rax # read once
mov 0x20(%rsp),%rcx
mov $0x20,%edx
cmp %rax,%rcx
cmovbe %rcx,%rax
mov %rax,(%rsp)
mov %rax,%rcx
mov %r14,%rax
Where 0x48(%rbp) is inode->i_size stored to %eax.
]
Fixes: 62b37622718c ("btrfs: Remove isize local variable in compress_file_range")
CC: stable@vger.kernel.org # v5.1+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ changelog updated ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-11 09:03:54 -04:00
|
|
|
/*
|
|
|
|
|
* We need to save i_size before now because it could change in between
|
|
|
|
|
* us evaluating the size and assigning it. This is because we lock and
|
|
|
|
|
* unlock the page in truncate and fallocate, and then modify the i_size
|
|
|
|
|
* later on.
|
|
|
|
|
*
|
|
|
|
|
* The barriers are to emulate READ_ONCE, remove that once i_size_read
|
|
|
|
|
* does that for us.
|
|
|
|
|
*/
|
|
|
|
|
barrier();
|
|
|
|
|
i_size = i_size_read(inode);
|
|
|
|
|
barrier();
|
|
|
|
|
actual_end = min_t(u64, i_size, end + 1);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
again:
|
|
|
|
|
will_compress = 0;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
|
2017-02-14 19:36:54 +01:00
|
|
|
nr_pages = min_t(unsigned long, nr_pages,
|
|
|
|
|
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
|
2007-12-17 20:14:01 -05:00
|
|
|
|
2009-02-04 09:31:06 -05:00
|
|
|
/*
|
|
|
|
|
* we don't want to send crud past the end of i_size through
|
|
|
|
|
* compression, that's just a waste of CPU time. So, if the
|
|
|
|
|
* end of the file is before the start of our current
|
|
|
|
|
* requested range of bytes, we bail out to the uncompressed
|
|
|
|
|
* cleanup code that can deal with all of this.
|
|
|
|
|
*
|
|
|
|
|
* It isn't really the fastest way to fix things, but this is a
|
|
|
|
|
* very uncommon corner.
|
|
|
|
|
*/
|
|
|
|
|
if (actual_end <= start)
|
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
total_compressed = actual_end - start;
|
|
|
|
|
|
2014-10-07 18:44:35 -04:00
|
|
|
/*
|
btrfs: subpage: only allow compression if the range is fully page aligned
For compressed write, we use a mechanism called async COW, which unlike
regular run_delalloc_cow() or cow_file_range() will also unlock the
first page.
This mechanism allows us to continue handling next ranges, without
waiting for the time consuming compression.
But this has a problem for subpage case, as we could have the following
delalloc range for a page:
0 32K 64K
| |///////| |///////|
\- A \- B
In the above case, if we pass both ranges to cow_file_range_async(),
both range A and range B will try to unlock the full page [0, 64K).
Whichever one finishes later will then try to do other
page operations like end_page_writeback() on an unlocked page, triggering
a VM layer BUG_ON().
To make subpage compression work at least partially, here we add another
restriction for it, only allow compression if the delalloc range is
fully page aligned.
By that, async extent is always ensured to unlock the first page
exclusively, just like it used to be for regular sectorsize.
In theory, we only need to make sure the delalloc range fully covers its
first page, but the tail page will be locked anyway, blocking later
writeback until the compression finishes.
Thus here we choose to make sure the range is fully page aligned before
doing the compression.
In the future, we could optimize the situation by properly increasing
subpage::writers number for the locked page, but that also means we need
to change how we run delalloc range of page.
(Instead of running each delalloc range we hit, we need to find and lock
all delalloc ranges covering the page, then run each of them).
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-27 15:22:08 +08:00
|
|
|
* Skip compression for a small file range(<=blocksize) that
|
2016-05-19 21:18:45 -04:00
|
|
|
* isn't an inline extent, since it doesn't save disk space at all.
|
2014-10-07 18:44:35 -04:00
|
|
|
*/
|
|
|
|
|
if (total_compressed <= blocksize &&
|
|
|
|
|
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
|
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
|
btrfs: subpage: only allow compression if the range is fully page aligned
For compressed write, we use a mechanism called async COW, which unlike
regular run_delalloc_cow() or cow_file_range() will also unlock the
first page.
This mechanism allows us to continue handling next ranges, without
waiting for the time consuming compression.
But this has a problem for subpage case, as we could have the following
delalloc range for a page:
0 32K 64K
| |///////| |///////|
\- A \- B
In the above case, if we pass both ranges to cow_file_range_async(),
both range A and range B will try to unlock the full page [0, 64K).
Whichever one finishes later will then try to do other
page operations like end_page_writeback() on an unlocked page, triggering
a VM layer BUG_ON().
To make subpage compression work at least partially, here we add another
restriction for it, only allow compression if the delalloc range is
fully page aligned.
By that, async extent is always ensured to unlock the first page
exclusively, just like it used to be for regular sectorsize.
In theory, we only need to make sure the delalloc range fully covers its
first page, but the tail page will be locked anyway, blocking later
writeback until the compression finishes.
Thus here we choose to make sure the range is fully page aligned before
doing the compression.
In the future, we could optimize the situation by properly increasing
subpage::writers number for the locked page, but that also means we need
to change how we run delalloc range of page.
(Instead of running each delalloc range we hit, we need to find and lock
all delalloc ranges covering the page, then run each of them).
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-27 15:22:08 +08:00
|
|
|
/*
|
|
|
|
|
* For subpage case, we require full page alignment for the sector
|
|
|
|
|
* aligned range.
|
|
|
|
|
* Thus we must also check against @actual_end, not just @end.
|
|
|
|
|
*/
|
|
|
|
|
if (blocksize < PAGE_SIZE) {
|
2022-05-26 22:35:40 +08:00
|
|
|
if (!PAGE_ALIGNED(start) ||
|
|
|
|
|
!PAGE_ALIGNED(round_up(actual_end, blocksize)))
|
btrfs: subpage: only allow compression if the range is fully page aligned
For compressed write, we use a mechanism called async COW, which unlike
regular run_delalloc_cow() or cow_file_range() will also unlock the
first page.
This mechanism allows us to continue handling next ranges, without
waiting for the time consuming compression.
But this has a problem for subpage case, as we could have the following
delalloc range for a page:
0 32K 64K
| |///////| |///////|
\- A \- B
In the above case, if we pass both ranges to cow_file_range_async(),
both range A and range B will try to unlock the full page [0, 64K).
Whichever one finishes later will then try to do other
page operations like end_page_writeback() on an unlocked page, triggering
a VM layer BUG_ON().
To make subpage compression work at least partially, here we add another
restriction for it, only allow compression if the delalloc range is
fully page aligned.
By that, async extent is always ensured to unlock the first page
exclusively, just like it used to be for regular sectorsize.
In theory, we only need to make sure the delalloc range fully covers its
first page, but the tail page will be locked anyway, blocking later
writeback until the compression finishes.
Thus here we choose to make sure the range is fully page aligned before
doing the compression.
In the future, we could optimize the situation by properly increasing
subpage::writers number for the locked page, but that also means we need
to change how we run delalloc range of page.
(Instead of running each delalloc range we hit, we need to find and lock
all delalloc ranges covering the page, then run each of them).
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-27 15:22:08 +08:00
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-14 19:36:54 +01:00
|
|
|
total_compressed = min_t(unsigned long, total_compressed,
|
|
|
|
|
BTRFS_MAX_UNCOMPRESSED);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
total_in = 0;
|
|
|
|
|
ret = 0;
|
2007-10-15 16:15:53 -04:00
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
/*
|
|
|
|
|
* we do compression for mount -o compress and when the
|
|
|
|
|
* inode has not been flagged as nocompress. This flag can
|
|
|
|
|
* change at any time if we discover bad compression ratios.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
*/
|
2021-08-25 13:41:42 +08:00
|
|
|
if (inode_need_compress(BTRFS_I(inode), start, end)) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
WARN_ON(pages);
|
2015-02-20 18:00:26 +01:00
|
|
|
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
|
2011-09-08 10:22:01 +08:00
|
|
|
if (!pages) {
|
|
|
|
|
/* just bail out to the uncompressed code */
|
2018-10-13 00:37:25 +01:00
|
|
|
nr_pages = 0;
|
2011-09-08 10:22:01 +08:00
|
|
|
goto cont;
|
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2017-07-17 19:41:31 +02:00
|
|
|
if (BTRFS_I(inode)->defrag_compress)
|
|
|
|
|
compress_type = BTRFS_I(inode)->defrag_compress;
|
|
|
|
|
else if (BTRFS_I(inode)->prop_compress)
|
2017-07-17 19:17:20 +02:00
|
|
|
compress_type = BTRFS_I(inode)->prop_compress;
|
2010-12-17 14:21:50 +08:00
|
|
|
|
2013-03-26 13:07:00 -04:00
|
|
|
/*
|
|
|
|
|
* we need to call clear_page_dirty_for_io on each
|
|
|
|
|
* page in the range. Otherwise applications with the file
|
|
|
|
|
* mmap'd can wander in and change the page contents while
|
|
|
|
|
* we are compressing them.
|
|
|
|
|
*
|
|
|
|
|
* If the compression fails for any reason, we set the pages
|
|
|
|
|
* dirty again later on.
|
2017-10-24 01:29:48 +03:00
|
|
|
*
|
|
|
|
|
* Note that the remaining part is redirtied, the start pointer
|
|
|
|
|
* has moved, the end is the original one.
|
2013-03-26 13:07:00 -04:00
|
|
|
*/
|
2017-10-24 01:29:48 +03:00
|
|
|
if (!redirty) {
|
|
|
|
|
extent_range_clear_dirty_for_io(inode, start, end);
|
|
|
|
|
redirty = 1;
|
|
|
|
|
}
|
2017-09-15 17:36:57 +02:00
|
|
|
|
|
|
|
|
/* Compression level is applied here and only here */
|
|
|
|
|
ret = btrfs_compress_pages(
|
|
|
|
|
compress_type | (fs_info->compress_level << 4),
|
2010-12-17 14:21:50 +08:00
|
|
|
inode->i_mapping, start,
|
2017-02-14 19:04:07 +01:00
|
|
|
pages,
|
2017-02-14 19:04:07 +01:00
|
|
|
&nr_pages,
|
2010-12-17 14:21:50 +08:00
|
|
|
&total_in,
|
2017-02-14 19:45:05 +01:00
|
|
|
&total_compressed);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
|
|
|
|
if (!ret) {
|
2018-12-05 15:23:03 +01:00
|
|
|
unsigned long offset = offset_in_page(total_compressed);
|
2017-02-14 19:04:07 +01:00
|
|
|
struct page *page = pages[nr_pages - 1];
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
|
|
|
|
/* zero the tail end of the last page, we might be
|
|
|
|
|
* sending it down to disk
|
|
|
|
|
*/
|
btrfs: use memzero_page() instead of open coded kmap pattern
There are many places where kmap/memset/kunmap patterns occur.
Use the newly lifted memzero_page() to eliminate direct uses of kmap and
leverage the new core functions use of kmap_local_page().
The development of this patch was aided by the following coccinelle
script:
// <smpl>
// SPDX-License-Identifier: GPL-2.0-only
// Find kmap/memset/kunmap pattern and replace with memset*page calls
//
// NOTE: Offsets and other expressions may be more complex than what the script
// will automatically generate. Therefore a catchall rule is provided to find
// the pattern which then must be evaluated by hand.
//
// Confidence: Low
// Copyright: (C) 2021 Intel Corporation
// URL: http://coccinelle.lip6.fr/
// Comments:
// Options:
//
// Then the memset pattern
//
@ memset_rule1 @
expression page, V, L, Off;
identifier ptr;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
-memset(ptr, 0, L);
+memzero_page(page, 0, L);
|
-memset(ptr + Off, 0, L);
+memzero_page(page, Off, L);
|
-memset(ptr, V, L);
+memset_page(page, V, 0, L);
|
-memset(ptr + Off, V, L);
+memset_page(page, V, Off, L);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule1
@
identifier memset_rule1.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
//
// Catch all
//
@ memset_rule2 @
expression page;
identifier ptr;
expression GenTo, GenSize, GenValue;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
//
// Some call sites have complex expressions within the memset/memcpy
// The following are catch-alls which need to be evaluated by hand.
//
-memset(GenTo, 0, GenSize);
+memzero_pageExtra(page, GenTo, GenSize);
|
-memset(GenTo, GenValue, GenSize);
+memset_pageExtra(page, GenValue, GenTo, GenSize);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule2
@
identifier memset_rule2.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
// </smpl>
Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-04 18:40:07 -07:00
|
|
|
if (offset)
|
|
|
|
|
memzero_page(page, offset, PAGE_SIZE - offset);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
will_compress = 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-09-08 10:22:01 +08:00
|
|
|
cont:
|
btrfs: subpage: disable inline extent creation
[BUG]
When running the following fsx command (extracted from generic/127) on
subpage filesystem, it can create inline extent with regular extents:
fsx -q -l 262144 -o 65536 -S 191110531 -N 9057 -R -W $mnt/file > /tmp/fsx
The offending extent would look like:
item 9 key (257 INODE_REF 256) itemoff 15703 itemsize 14
index 2 namelen 4 name: file
item 10 key (257 EXTENT_DATA 0) itemoff 14975 itemsize 728
generation 7 type 0 (inline)
inline extent data size 707 ram_bytes 707 compression 0 (none)
item 11 key (257 EXTENT_DATA 4096) itemoff 14922 itemsize 53
generation 7 type 2 (prealloc)
prealloc data disk byte 102346752 nr 4096
prealloc data offset 0 nr 4096
[CAUSE]
For subpage filesystem, the writeback is triggered in page units, which
means, even if we just want to writeback range [16K, 20K) for 64K page
system, we will still try to writeback any dirty sector of range [0, 64K).
This is never a problem if sectorsize == PAGE_SIZE, but for subpage,
this can cause unexpected problems.
For above test case, the last several operations from fsx are:
9055 trunc from 0x40000 to 0x2c3
9057 falloc from 0x164c to 0x19d2 (0x386 bytes)
In operation 9055, we dirtied sector [0, 4096), then in falloc, we call
btrfs_wait_ordered_range(inode, start=4096, len=4096), only expecting to
writeback any dirty data in [4096, 8192), but nothing else.
Unfortunately, in subpage case, above btrfs_wait_ordered_range() will
trigger writeback of the range [0, 64K), which includes the data at
[0, 4096).
And since at the call site, we haven't yet increased i_size, which is
still 707, this means cow_file_range() can insert an inline extent.
Resulting in the above inline + regular extent.
[WORKAROUND]
I don't really have any good short-term solution yet, as this means all
operations that would trigger writeback need to be reviewed for any
i_size change.
So here I choose to disable inline extent creation for subpage case as a
workaround. We have done tons of work just to avoid such extents, so I
don't want to create an exception just for subpage.
This only affects inline extent creation, subpage has no problem reading
existing inline extents at all.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-26 14:34:59 +08:00
|
|
|
/*
|
|
|
|
|
* Check cow_file_range() for why we don't even try to create inline
|
|
|
|
|
* extent for subpage case.
|
|
|
|
|
*/
|
|
|
|
|
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
/* lets try to make an inline extent */
|
2017-09-15 01:57:26 +03:00
|
|
|
if (ret || total_in < actual_end) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
/* we didn't compress the entire range, try
|
2008-11-06 22:02:51 -05:00
|
|
|
* to make an uncompressed inline extent.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
*/
|
2021-11-16 14:03:45 -08:00
|
|
|
ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
|
2020-06-03 08:55:12 +03:00
|
|
|
0, BTRFS_COMPRESS_NONE,
|
2019-11-07 15:19:16 -08:00
|
|
|
NULL, false);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
} else {
|
2008-11-06 22:02:51 -05:00
|
|
|
/* try making a compressed inline extent */
|
2021-11-16 14:03:45 -08:00
|
|
|
ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
|
2011-03-28 08:30:38 +00:00
|
|
|
total_compressed,
|
2019-11-07 15:19:16 -08:00
|
|
|
compress_type, pages,
|
|
|
|
|
false);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret <= 0) {
|
2013-07-29 13:22:24 -04:00
|
|
|
unsigned long clear_flags = EXTENT_DELALLOC |
|
2017-10-19 14:15:55 -04:00
|
|
|
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
|
|
|
|
|
EXTENT_DO_ACCOUNTING;
|
2014-10-10 10:45:12 +01:00
|
|
|
unsigned long page_error_op;
|
|
|
|
|
|
|
|
|
|
page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
|
2013-07-29 13:22:24 -04:00
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
/*
|
2012-03-12 16:03:00 +01:00
|
|
|
* inline extent creation worked or returned error,
|
|
|
|
|
* we don't need to create any more async work items.
|
|
|
|
|
* Unlock and free up our temp pages.
|
2017-10-19 14:15:55 -04:00
|
|
|
*
|
|
|
|
|
* We use DO_ACCOUNTING here because we need the
|
|
|
|
|
* delalloc_release_metadata to be done _after_ we drop
|
|
|
|
|
* our outstanding extent for clearing delalloc for this
|
|
|
|
|
* range.
|
2008-11-06 22:02:51 -05:00
|
|
|
*/
|
2020-06-03 08:55:06 +03:00
|
|
|
extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
|
|
|
|
|
NULL,
|
2019-07-17 16:18:16 +03:00
|
|
|
clear_flags,
|
2016-07-19 16:50:36 +08:00
|
|
|
PAGE_UNLOCK |
|
2021-01-26 16:33:45 +08:00
|
|
|
PAGE_START_WRITEBACK |
|
2014-10-10 10:45:12 +01:00
|
|
|
page_error_op |
|
2013-07-29 11:20:47 -04:00
|
|
|
PAGE_END_WRITEBACK);
|
2019-07-17 14:41:45 +03:00
|
|
|
|
2020-07-28 16:39:26 +08:00
|
|
|
/*
|
|
|
|
|
* Ensure we only free the compressed pages if we have
|
|
|
|
|
* them allocated, as we can still reach here with
|
|
|
|
|
* inode_need_compress() == false.
|
|
|
|
|
*/
|
|
|
|
|
if (pages) {
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
|
WARN_ON(pages[i]->mapping);
|
|
|
|
|
put_page(pages[i]);
|
|
|
|
|
}
|
|
|
|
|
kfree(pages);
|
2019-07-17 14:41:45 +03:00
|
|
|
}
|
|
|
|
|
return 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (will_compress) {
|
|
|
|
|
/*
|
|
|
|
|
* we aren't doing an inline extent round the compressed size
|
|
|
|
|
* up to a block size boundary so the allocator does sane
|
|
|
|
|
* things
|
|
|
|
|
*/
|
2013-02-26 08:10:22 +00:00
|
|
|
total_compressed = ALIGN(total_compressed, blocksize);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* one last check to make sure the compression is really a
|
2017-06-06 14:41:15 +03:00
|
|
|
* win, compare the page count read with the blocks on disk,
|
|
|
|
|
* compression must free at least one sector size
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
*/
|
2021-09-27 15:21:59 +08:00
|
|
|
total_in = round_up(total_in, fs_info->sectorsize);
|
2017-06-06 14:41:15 +03:00
|
|
|
if (total_compressed + blocksize <= total_in) {
|
2019-07-17 14:41:44 +03:00
|
|
|
compressed_extents++;
|
2016-03-25 19:01:33 -07:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The async work queues will take care of doing actual
|
|
|
|
|
* allocation on disk for these compressed pages, and
|
|
|
|
|
* will submit them to the elevator.
|
|
|
|
|
*/
|
2019-03-12 17:20:25 +02:00
|
|
|
add_async_extent(async_chunk, start, total_in,
|
2017-02-14 19:04:07 +01:00
|
|
|
total_compressed, pages, nr_pages,
|
2016-03-25 19:01:33 -07:00
|
|
|
compress_type);
|
|
|
|
|
|
2017-10-03 18:06:01 +03:00
|
|
|
if (start + total_in < end) {
|
|
|
|
|
start += total_in;
|
2016-03-25 19:01:33 -07:00
|
|
|
pages = NULL;
|
|
|
|
|
cond_resched();
|
|
|
|
|
goto again;
|
|
|
|
|
}
|
2019-07-17 14:41:44 +03:00
|
|
|
return compressed_extents;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
}
|
2016-03-25 19:01:33 -07:00
|
|
|
if (pages) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
/*
|
|
|
|
|
* the compression code ran but failed to make things smaller,
|
|
|
|
|
* free any pages it allocated and our page pointer array
|
|
|
|
|
*/
|
2017-02-14 19:04:07 +01:00
|
|
|
for (i = 0; i < nr_pages; i++) {
|
2008-10-31 12:46:39 -04:00
|
|
|
WARN_ON(pages[i]->mapping);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(pages[i]);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
kfree(pages);
|
|
|
|
|
pages = NULL;
|
|
|
|
|
total_compressed = 0;
|
2017-02-14 19:04:07 +01:00
|
|
|
nr_pages = 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
|
|
|
|
/* flag the file so we don't compress in the future */
|
2016-06-22 18:54:23 -04:00
|
|
|
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
|
2017-07-17 19:17:20 +02:00
|
|
|
!(BTRFS_I(inode)->prop_compress)) {
|
2010-01-28 16:18:15 -05:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
|
2010-03-11 09:42:04 -05:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
2009-02-04 09:31:06 -05:00
|
|
|
cleanup_and_bail_uncompressed:
|
2016-03-25 19:01:33 -07:00
|
|
|
/*
|
|
|
|
|
* No compression, but we still need to write the pages in the file
|
|
|
|
|
* we've been given so far. redirty the locked page if it corresponds
|
|
|
|
|
* to our extent and set things up for the async work queue to run
|
|
|
|
|
* cow_file_range to do the normal delalloc dance.
|
|
|
|
|
*/
|
Btrfs: only associate the locked page with one async_chunk struct
The btrfs writepages function collects a large range of pages flagged
for delayed allocation, and then sends them down through the COW code
for processing. When compression is on, we allocate one async_chunk
structure for every 512K, and then run those pages through the
compression code for IO submission.
writepages starts all of this off with a single page, locked by the
original call to extent_write_cache_pages(), and it's important to keep
track of this page because it has already been through
clear_page_dirty_for_io().
The btrfs async_chunk struct has a pointer to the locked_page, and when
we're redirtying the page because compression had to fallback to
uncompressed IO, we use page->index to decide if a given async_chunk
struct really owns that page.
But this is racy. If a given delalloc range is broken up into two
async_chunks (chunkA and chunkB), we can end up with something like
this:
compress_file_range(chunkA)
submit_compress_extents(chunkA)
submit compressed bios(chunkA)
put_page(locked_page)
compress_file_range(chunkB)
...
Or:
async_cow_submit
submit_compressed_extents <--- falls back to buffered writeout
cow_file_range
extent_clear_unlock_delalloc
__process_pages_contig
put_page(locked_pages)
async_cow_submit
The end result is that chunkA is completed and cleaned up before chunkB
even starts processing. This means we can free locked_page() and reuse
it elsewhere. If we get really lucky, it'll have the same page->index
in its new home as it did before.
While we're processing chunkB, we might decide we need to fall back to
uncompressed IO, and so compress_file_range() will call
__set_page_dirty_nobuffers() on chunkB->locked_page.
Without cgroups in use, this creates a phantom dirty page, which
isn't great but isn't the end of the world. What can happen is that it goes
through the fixup worker and the whole COW machinery again:
in submit_compressed_extents():
while (async extents) {
...
cow_file_range
if (!page_started ...)
extent_write_locked_range
else if (...)
unlock_page
continue;
This hasn't been observed in practice but is still possible.
With cgroups in use, we might crash in the accounting code because
page->mapping->i_wb isn't set.
BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0
IP: percpu_counter_add_batch+0x11/0x70
PGD 66534e067 P4D 66534e067 PUD 66534f067 PMD 0
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 16 PID: 2172 Comm: rm Not tainted
RIP: 0010:percpu_counter_add_batch+0x11/0x70
RSP: 0018:ffffc9000a97bbe0 EFLAGS: 00010286
RAX: 0000000000000005 RBX: 0000000000000090 RCX: 0000000000026115
RDX: 0000000000000030 RSI: ffffffffffffffff RDI: 0000000000000090
RBP: 0000000000000000 R08: fffffffffffffff5 R09: 0000000000000000
R10: 00000000000260c0 R11: ffff881037fc26c0 R12: ffffffffffffffff
R13: ffff880fe4111548 R14: ffffc9000a97bc90 R15: 0000000000000001
FS: 00007f5503ced480(0000) GS:ffff880ff7200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000d0 CR3: 00000001e0459005 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
account_page_cleaned+0x15b/0x1f0
__cancel_dirty_page+0x146/0x200
truncate_cleanup_page+0x92/0xb0
truncate_inode_pages_range+0x202/0x7d0
btrfs_evict_inode+0x92/0x5a0
evict+0xc1/0x190
do_unlinkat+0x176/0x280
do_syscall_64+0x63/0x1a0
entry_SYSCALL_64_after_hwframe+0x42/0xb7
The fix here is to make async_chunk->locked_page NULL everywhere but the
one async_chunk struct that's allowed to do things to the locked page.
Link: https://lore.kernel.org/linux-btrfs/c2419d01-5c84-3fb4-189e-4db519d08796@suse.com/
Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Chris Mason <clm@fb.com>
[ update changelog from mail thread discussion ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-10 12:28:16 -07:00
|
|
|
if (async_chunk->locked_page &&
|
|
|
|
|
(page_offset(async_chunk->locked_page) >= start &&
|
|
|
|
|
page_offset(async_chunk->locked_page)) <= end) {
|
2019-03-12 17:20:27 +02:00
|
|
|
__set_page_dirty_nobuffers(async_chunk->locked_page);
|
2016-03-25 19:01:33 -07:00
|
|
|
/* unlocked later on in the async handlers */
|
Btrfs: only associate the locked page with one async_chunk struct
The btrfs writepages function collects a large range of pages flagged
for delayed allocation, and then sends them down through the COW code
for processing. When compression is on, we allocate one async_chunk
structure for every 512K, and then run those pages through the
compression code for IO submission.
writepages starts all of this off with a single page, locked by the
original call to extent_write_cache_pages(), and it's important to keep
track of this page because it has already been through
clear_page_dirty_for_io().
The btrfs async_chunk struct has a pointer to the locked_page, and when
we're redirtying the page because compression had to fallback to
uncompressed IO, we use page->index to decide if a given async_chunk
struct really owns that page.
But this is racy. If a given delalloc range is broken up into two
async_chunks (chunkA and chunkB), we can end up with something like
this:
compress_file_range(chunkA)
submit_compress_extents(chunkA)
submit compressed bios(chunkA)
put_page(locked_page)
compress_file_range(chunkB)
...
Or:
async_cow_submit
submit_compressed_extents <--- falls back to buffered writeout
cow_file_range
extent_clear_unlock_delalloc
__process_pages_contig
put_page(locked_pages)
async_cow_submit
The end result is that chunkA is completed and cleaned up before chunkB
even starts processing. This means we can free locked_page() and reuse
it elsewhere. If we get really lucky, it'll have the same page->index
in its new home as it did before.
While we're processing chunkB, we might decide we need to fall back to
uncompressed IO, and so compress_file_range() will call
__set_page_dirty_nobuffers() on chunkB->locked_page.
Without cgroups in use, this creates a phantom dirty page, which
isn't great but isn't the end of the world. What can happen is that it goes
through the fixup worker and the whole COW machinery again:
in submit_compressed_extents():
while (async extents) {
...
cow_file_range
if (!page_started ...)
extent_write_locked_range
else if (...)
unlock_page
continue;
This hasn't been observed in practice but is still possible.
With cgroups in use, we might crash in the accounting code because
page->mapping->i_wb isn't set.
BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0
IP: percpu_counter_add_batch+0x11/0x70
PGD 66534e067 P4D 66534e067 PUD 66534f067 PMD 0
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 16 PID: 2172 Comm: rm Not tainted
RIP: 0010:percpu_counter_add_batch+0x11/0x70
RSP: 0018:ffffc9000a97bbe0 EFLAGS: 00010286
RAX: 0000000000000005 RBX: 0000000000000090 RCX: 0000000000026115
RDX: 0000000000000030 RSI: ffffffffffffffff RDI: 0000000000000090
RBP: 0000000000000000 R08: fffffffffffffff5 R09: 0000000000000000
R10: 00000000000260c0 R11: ffff881037fc26c0 R12: ffffffffffffffff
R13: ffff880fe4111548 R14: ffffc9000a97bc90 R15: 0000000000000001
FS: 00007f5503ced480(0000) GS:ffff880ff7200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000d0 CR3: 00000001e0459005 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
account_page_cleaned+0x15b/0x1f0
__cancel_dirty_page+0x146/0x200
truncate_cleanup_page+0x92/0xb0
truncate_inode_pages_range+0x202/0x7d0
btrfs_evict_inode+0x92/0x5a0
evict+0xc1/0x190
do_unlinkat+0x176/0x280
do_syscall_64+0x63/0x1a0
entry_SYSCALL_64_after_hwframe+0x42/0xb7
The fix here is to make async_chunk->locked_page NULL everywhere but the
one async_chunk struct that's allowed to do things to the locked page.
Link: https://lore.kernel.org/linux-btrfs/c2419d01-5c84-3fb4-189e-4db519d08796@suse.com/
Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Chris Mason <clm@fb.com>
[ update changelog from mail thread discussion ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-10 12:28:16 -07:00
|
|
|
}
|
2016-03-25 19:01:33 -07:00
|
|
|
|
|
|
|
|
if (redirty)
|
|
|
|
|
extent_range_redirty_for_io(inode, start, end);
|
2019-03-12 17:20:25 +02:00
|
|
|
add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
|
2016-03-25 19:01:33 -07:00
|
|
|
BTRFS_COMPRESS_NONE);
|
2019-07-17 14:41:44 +03:00
|
|
|
compressed_extents++;
|
2008-04-17 11:29:12 -04:00
|
|
|
|
2019-07-17 14:41:44 +03:00
|
|
|
return compressed_extents;
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
|
|
|
|
|
2014-10-06 22:14:24 +01:00
|
|
|
static void free_async_extent_pages(struct async_extent *async_extent)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
if (!async_extent->pages)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < async_extent->nr_pages; i++) {
|
|
|
|
|
WARN_ON(async_extent->pages[i]->mapping);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(async_extent->pages[i]);
|
2014-10-06 22:14:24 +01:00
|
|
|
}
|
|
|
|
|
kfree(async_extent->pages);
|
|
|
|
|
async_extent->nr_pages = 0;
|
|
|
|
|
async_extent->pages = NULL;
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
|
|
|
|
|
2021-09-27 15:22:03 +08:00
|
|
|
static int submit_uncompressed_range(struct btrfs_inode *inode,
|
|
|
|
|
struct async_extent *async_extent,
|
|
|
|
|
struct page *locked_page)
|
2008-11-06 22:02:51 -05:00
|
|
|
{
|
2021-09-27 15:22:03 +08:00
|
|
|
u64 start = async_extent->start;
|
|
|
|
|
u64 end = async_extent->start + async_extent->ram_size - 1;
|
|
|
|
|
unsigned long nr_written = 0;
|
|
|
|
|
int page_started = 0;
|
|
|
|
|
int ret;
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2021-09-27 15:22:03 +08:00
|
|
|
/*
|
|
|
|
|
* Call cow_file_range() to run the delalloc range directly, since we
|
|
|
|
|
* won't go to NOCOW or async path again.
|
|
|
|
|
*
|
|
|
|
|
* Also we call cow_file_range() with @unlock_page == 0, so that we
|
|
|
|
|
* can directly submit them without interruption.
|
|
|
|
|
*/
|
|
|
|
|
ret = cow_file_range(inode, locked_page, start, end, &page_started,
|
2022-07-09 08:18:49 +09:00
|
|
|
&nr_written, 0, NULL);
|
2021-09-27 15:22:03 +08:00
|
|
|
/* Inline extent inserted, page gets unlocked and everything is done */
|
|
|
|
|
if (page_started) {
|
|
|
|
|
ret = 0;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
if (ret < 0) {
|
2022-06-21 15:41:01 +09:00
|
|
|
btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
|
|
|
|
|
if (locked_page) {
|
|
|
|
|
const u64 page_start = page_offset(locked_page);
|
|
|
|
|
const u64 page_end = page_start + PAGE_SIZE - 1;
|
|
|
|
|
|
|
|
|
|
btrfs_page_set_error(inode->root->fs_info, locked_page,
|
|
|
|
|
page_start, PAGE_SIZE);
|
|
|
|
|
set_page_writeback(locked_page);
|
|
|
|
|
end_page_writeback(locked_page);
|
|
|
|
|
end_extent_writepage(locked_page, ret, page_start, page_end);
|
2021-09-27 15:22:03 +08:00
|
|
|
unlock_page(locked_page);
|
2022-06-21 15:41:01 +09:00
|
|
|
}
|
2021-09-27 15:22:03 +08:00
|
|
|
goto out;
|
|
|
|
|
}
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2021-09-27 15:22:03 +08:00
|
|
|
ret = extent_write_locked_range(&inode->vfs_inode, start, end);
|
|
|
|
|
/* All pages will be unlocked, including @locked_page */
|
|
|
|
|
out:
|
|
|
|
|
kfree(async_extent);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2012-03-12 16:03:00 +01:00
|
|
|
|
2021-09-27 15:21:57 +08:00
|
|
|
static int submit_one_async_extent(struct btrfs_inode *inode,
|
|
|
|
|
struct async_chunk *async_chunk,
|
|
|
|
|
struct async_extent *async_extent,
|
|
|
|
|
u64 *alloc_hint)
|
2008-11-06 22:02:51 -05:00
|
|
|
{
|
2021-09-27 15:21:57 +08:00
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-11-06 22:02:51 -05:00
|
|
|
struct btrfs_key ins;
|
2021-09-27 15:22:03 +08:00
|
|
|
struct page *locked_page = NULL;
|
2008-11-06 22:02:51 -05:00
|
|
|
struct extent_map *em;
|
2009-11-10 21:23:48 -05:00
|
|
|
int ret = 0;
|
2021-09-27 15:21:57 +08:00
|
|
|
u64 start = async_extent->start;
|
|
|
|
|
u64 end = async_extent->start + async_extent->ram_size - 1;
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2021-09-27 15:22:03 +08:00
|
|
|
/*
|
|
|
|
|
* If async_chunk->locked_page is in the async_extent range, we need to
|
|
|
|
|
* handle it.
|
|
|
|
|
*/
|
|
|
|
|
if (async_chunk->locked_page) {
|
|
|
|
|
u64 locked_page_start = page_offset(async_chunk->locked_page);
|
|
|
|
|
u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
|
2013-02-06 16:49:15 -05:00
|
|
|
|
2021-09-27 15:22:03 +08:00
|
|
|
if (!(start >= locked_page_end || end <= locked_page_start))
|
|
|
|
|
locked_page = async_chunk->locked_page;
|
2021-09-27 15:21:57 +08:00
|
|
|
}
|
2021-09-27 15:22:03 +08:00
|
|
|
lock_extent(io_tree, start, end);
|
2014-07-24 22:48:05 +08:00
|
|
|
|
2021-09-27 15:22:03 +08:00
|
|
|
/* We have fall back to uncompressed write */
|
|
|
|
|
if (!async_extent->pages)
|
|
|
|
|
return submit_uncompressed_range(inode, async_extent, locked_page);
|
2014-07-24 22:48:05 +08:00
|
|
|
|
2021-09-27 15:21:57 +08:00
|
|
|
ret = btrfs_reserve_extent(root, async_extent->ram_size,
|
|
|
|
|
async_extent->compressed_size,
|
|
|
|
|
async_extent->compressed_size,
|
|
|
|
|
0, *alloc_hint, &ins, 1, 1);
|
|
|
|
|
if (ret) {
|
|
|
|
|
free_async_extent_pages(async_extent);
|
2009-11-12 09:34:21 +00:00
|
|
|
/*
|
2021-09-27 15:21:57 +08:00
|
|
|
* Here we used to try again by going back to non-compressed
|
|
|
|
|
* path for ENOSPC. But we can't reserve space even for
|
|
|
|
|
* compressed size, how could it work for uncompressed size
|
|
|
|
|
* which requires larger size? So here we directly go error
|
|
|
|
|
* path.
|
2009-11-12 09:34:21 +00:00
|
|
|
*/
|
2021-09-27 15:21:57 +08:00
|
|
|
goto out_free;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Here we're doing allocation and writeback of the compressed pages */
|
|
|
|
|
em = create_io_em(inode, start,
|
|
|
|
|
async_extent->ram_size, /* len */
|
|
|
|
|
start, /* orig_start */
|
|
|
|
|
ins.objectid, /* block_start */
|
|
|
|
|
ins.offset, /* block_len */
|
|
|
|
|
ins.offset, /* orig_block_len */
|
|
|
|
|
async_extent->ram_size, /* ram_bytes */
|
|
|
|
|
async_extent->compress_type,
|
|
|
|
|
BTRFS_ORDERED_COMPRESSED);
|
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto out_free_reserve;
|
|
|
|
|
}
|
|
|
|
|
free_extent_map(em);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-11-06 12:11:56 -08:00
|
|
|
ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
|
|
|
|
|
async_extent->ram_size, /* num_bytes */
|
|
|
|
|
async_extent->ram_size, /* ram_bytes */
|
|
|
|
|
ins.objectid, /* disk_bytenr */
|
|
|
|
|
ins.offset, /* disk_num_bytes */
|
|
|
|
|
0, /* offset */
|
|
|
|
|
1 << BTRFS_ORDERED_COMPRESSED,
|
|
|
|
|
async_extent->compress_type);
|
2021-09-27 15:21:57 +08:00
|
|
|
if (ret) {
|
|
|
|
|
btrfs_drop_extent_cache(inode, start, end, 0);
|
|
|
|
|
goto out_free_reserve;
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
2021-09-27 15:21:57 +08:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
|
|
|
|
|
|
|
|
|
/* Clear dirty, set writeback and unlock the pages. */
|
|
|
|
|
extent_clear_unlock_delalloc(inode, start, end,
|
|
|
|
|
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
|
|
|
|
|
PAGE_UNLOCK | PAGE_START_WRITEBACK);
|
|
|
|
|
if (btrfs_submit_compressed_write(inode, start, /* file_offset */
|
|
|
|
|
async_extent->ram_size, /* num_bytes */
|
|
|
|
|
ins.objectid, /* disk_bytenr */
|
|
|
|
|
ins.offset, /* compressed_len */
|
|
|
|
|
async_extent->pages, /* compressed_pages */
|
|
|
|
|
async_extent->nr_pages,
|
|
|
|
|
async_chunk->write_flags,
|
2019-08-13 16:00:02 -07:00
|
|
|
async_chunk->blkcg_css, true)) {
|
2021-09-27 15:21:57 +08:00
|
|
|
const u64 start = async_extent->start;
|
|
|
|
|
const u64 end = start + async_extent->ram_size - 1;
|
|
|
|
|
|
|
|
|
|
btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
|
|
|
|
|
|
|
|
|
|
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
|
|
|
|
|
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
|
|
|
|
|
free_async_extent_pages(async_extent);
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
2021-09-27 15:21:57 +08:00
|
|
|
*alloc_hint = ins.objectid + ins.offset;
|
|
|
|
|
kfree(async_extent);
|
|
|
|
|
return ret;
|
|
|
|
|
|
2013-02-06 16:49:15 -05:00
|
|
|
out_free_reserve:
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
|
2012-03-12 16:03:00 +01:00
|
|
|
out_free:
|
2021-09-27 15:21:57 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end,
|
2013-07-29 11:20:47 -04:00
|
|
|
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
EXTENT_DELALLOC_NEW |
|
2013-07-29 13:22:24 -04:00
|
|
|
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
|
2021-01-26 16:33:45 +08:00
|
|
|
PAGE_UNLOCK | PAGE_START_WRITEBACK |
|
|
|
|
|
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
|
2014-10-06 22:14:24 +01:00
|
|
|
free_async_extent_pages(async_extent);
|
2012-03-12 16:03:00 +01:00
|
|
|
kfree(async_extent);
|
2021-09-27 15:21:57 +08:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Phase two of compressed writeback. This is the ordered portion of the code,
|
|
|
|
|
* which only gets called in the order the work was queued. We walk all the
|
|
|
|
|
* async extents created by compress_file_range and send them down to the disk.
|
|
|
|
|
*/
|
|
|
|
|
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
|
|
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
|
|
|
struct async_extent *async_extent;
|
|
|
|
|
u64 alloc_hint = 0;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
while (!list_empty(&async_chunk->extents)) {
|
|
|
|
|
u64 extent_start;
|
|
|
|
|
u64 ram_size;
|
|
|
|
|
|
|
|
|
|
async_extent = list_entry(async_chunk->extents.next,
|
|
|
|
|
struct async_extent, list);
|
|
|
|
|
list_del(&async_extent->list);
|
|
|
|
|
extent_start = async_extent->start;
|
|
|
|
|
ram_size = async_extent->ram_size;
|
|
|
|
|
|
|
|
|
|
ret = submit_one_async_extent(inode, async_chunk, async_extent,
|
|
|
|
|
&alloc_hint);
|
|
|
|
|
btrfs_debug(fs_info,
|
|
|
|
|
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
|
|
|
|
|
inode->root->root_key.objectid,
|
|
|
|
|
btrfs_ino(inode), extent_start, ram_size, ret);
|
|
|
|
|
}
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:02 +03:00
|
|
|
/*
 * Pick a disk byte number to pass to the allocator as a hint for the file
 * range [@start, @start + @num_bytes).
 *
 * The extent map covering the range is looked up in @inode's extent tree.  If
 * its block_start is not below EXTENT_MAP_LAST_BYTE, the mapping found by a
 * (0, 0) search is tried instead.  Returns the chosen block_start, or 0 when
 * no usable mapping exists.
 */
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *tree = &inode->extent_tree;
	struct extent_map *map;
	u64 hint = 0;

	read_lock(&tree->lock);

	map = search_extent_mapping(tree, start, num_bytes);

	/*
	 * The mapping for the range does not carry an actual block number:
	 * drop it and fall back to the first block of the inode.  If that one
	 * is bogus too, we simply end up without a hint.
	 */
	if (map && map->block_start >= EXTENT_MAP_LAST_BYTE) {
		free_extent_map(map);
		map = search_extent_mapping(tree, 0, 0);
	}

	/* Whatever mapping we hold now is used if valid, then released. */
	if (map) {
		if (map->block_start < EXTENT_MAP_LAST_BYTE)
			hint = map->block_start;
		free_extent_map(map);
	}

	read_unlock(&tree->lock);

	return hint;
}
|
|
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
/*
|
|
|
|
|
* when extent_io.c finds a delayed allocation range in the file,
|
|
|
|
|
* the call backs end up in this code. The basic idea is to
|
|
|
|
|
* allocate extents on disk for the range, and create ordered data structs
|
|
|
|
|
* in ram to track those extents.
|
|
|
|
|
*
|
|
|
|
|
* locked_page is the page that writepage had locked already. We use
|
|
|
|
|
* it to make sure we don't do extra locks or unlocks.
|
|
|
|
|
*
|
|
|
|
|
* *page_started is set to one if we unlock locked_page and do everything
|
|
|
|
|
* required to start IO on it. It may be clean and already done with
|
|
|
|
|
* IO when we return.
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the give region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing out them
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
*
|
|
|
|
|
* When unlock == 1, we unlock the pages in successfully allocated regions.
|
|
|
|
|
* When unlock == 0, we leave them locked for writing them out.
|
|
|
|
|
*
|
|
|
|
|
* However, we unlock all the pages except @locked_page in case of failure.
|
|
|
|
|
*
|
|
|
|
|
* In summary, the page locking state will be as follows:
|
|
|
|
|
*
|
|
|
|
|
* - page_started == 1 (return value)
|
|
|
|
|
* - All the pages are unlocked. IO is started.
|
|
|
|
|
* - Note that this can happen only on success
|
|
|
|
|
* - unlock == 1
|
|
|
|
|
* - All the pages except @locked_page are unlocked in any case
|
|
|
|
|
* - unlock == 0
|
|
|
|
|
* - On success, all the pages are locked for writing them out
|
|
|
|
|
* - On failure, all the pages except @locked_page are unlocked
|
|
|
|
|
*
|
|
|
|
|
* When a failure happens in the second or later iteration of the
|
|
|
|
|
* while-loop, the ordered extents created in previous iterations are kept
|
|
|
|
|
* intact. So, the caller must clean them up by calling
|
|
|
|
|
* btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
|
|
|
|
|
* example.
|
2008-11-06 22:02:51 -05:00
|
|
|
*/
|
2020-06-03 08:55:14 +03:00
|
|
|
static noinline int cow_file_range(struct btrfs_inode *inode,
|
2013-08-14 14:02:47 -04:00
|
|
|
struct page *locked_page,
|
2019-07-17 16:18:16 +03:00
|
|
|
u64 start, u64 end, int *page_started,
|
2022-07-09 08:18:49 +09:00
|
|
|
unsigned long *nr_written, int unlock,
|
|
|
|
|
u64 *done_offset)
|
2008-11-06 22:02:51 -05:00
|
|
|
{
|
2020-06-03 08:55:14 +03:00
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-11-06 22:02:51 -05:00
|
|
|
u64 alloc_hint = 0;
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the give region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing out them
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
u64 orig_start = start;
|
2008-11-06 22:02:51 -05:00
|
|
|
u64 num_bytes;
|
|
|
|
|
unsigned long ram_size;
|
2017-03-06 23:04:20 +00:00
|
|
|
u64 cur_alloc_size = 0;
|
btrfs: fix data block group relocation failure due to concurrent scrub
When running relocation of a data block group while scrub is running in
parallel, it is possible that the relocation will fail and abort the
current transaction with an -EINVAL error:
[134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents
[134243.999871] ------------[ cut here ]------------
[134244.000741] BTRFS: Transaction aborted (error -22)
[134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
[134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.017151] Code: 48 c7 c7 (...)
[134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286
[134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000
[134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001
[134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001
[134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08
[134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000
[134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000
[134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0
[134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134244.034484] Call Trace:
[134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs]
[134244.035859] do_relocation+0x30b/0x790 [btrfs]
[134244.036681] ? do_raw_spin_unlock+0x49/0xc0
[134244.037460] ? _raw_spin_unlock+0x29/0x40
[134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs]
[134244.039245] relocate_block_group+0x388/0x770 [btrfs]
[134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs]
[134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs]
[134244.041345] btrfs_balance+0xc06/0x1860 [btrfs]
[134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs]
[134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs]
[134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs]
[134244.049043] ? do_raw_spin_unlock+0x49/0xc0
[134244.049838] ? _raw_spin_unlock+0x29/0x40
[134244.050587] ? __handle_mm_fault+0x11b3/0x14b0
[134244.051417] ? ksys_ioctl+0x92/0xb0
[134244.052070] ksys_ioctl+0x92/0xb0
[134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c
[134244.053511] __x64_sys_ioctl+0x16/0x20
[134244.054206] do_syscall_64+0x5c/0x280
[134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[134244.055819] RIP: 0033:0x7f29b51c9dd7
[134244.056491] Code: 00 00 00 (...)
[134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7
[134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003
[134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000
[134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a
[134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0
[134244.067626] irq event stamp: 0
[134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.070909] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134244.073432] ---[ end trace bd7c03622e0b0a99 ]---
The -EINVAL error comes from the following chain of function calls:
__btrfs_cow_block() <-- aborts the transaction
btrfs_reloc_cow_block()
replace_file_extents()
get_new_location() <-- returns -EINVAL
When relocating a data block group, for each allocated extent of the block
group, we preallocate another extent (at prealloc_file_extent_cluster()),
associated with the data relocation inode, and then dirty all its pages.
These preallocated extents have, and must have, the same size that extents
from the data block group being relocated have.
Later before we start the relocation stage that updates pointers (bytenr
field of file extent items) to point to the the new extents, we trigger
writeback for the data relocation inode. The expectation is that writeback
will write the pages to the previously preallocated extents, that it
follows the NOCOW path. That is generally the case, however, if a scrub
is running it may have turned the block group that contains those extents
into RO mode, in which case writeback falls back to the COW path.
However in the COW path instead of allocating exactly one extent with the
expected size, the allocator may end up allocating several smaller extents
due to free space fragmentation - because we tell it at cow_file_range()
that the minimum allocation size can match the filesystem's sector size.
This later breaks the relocation's expectation that an extent associated
to a file extent item in the data relocation inode has the same size as
the respective extent pointed by a file extent item in another tree - in
this case the extent to which the relocation inode poins to is smaller,
causing relocation.c:get_new_location() to return -EINVAL.
For example, if we are relocating a data block group X that has a logical
address of X and the block group has an extent allocated at the logical
address X + 128KiB with a size of 64KiB:
1) At prealloc_file_extent_cluster() we allocate an extent for the data
relocation inode with a size of 64KiB and associate it to the file
offset 128KiB (X + 128KiB - X) of the data relocation inode. This
preallocated extent was allocated at block group Z;
2) A scrub running in parallel turns block group Z into RO mode and
starts scrubing its extents;
3) Relocation triggers writeback for the data relocation inode;
4) When running delalloc (btrfs_run_delalloc_range()), we try first the
NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC
set in its flags. However, because block group Z is in RO mode, the
NOCOW path (run_delalloc_nocow()) falls back into the COW path, by
calling cow_file_range();
5) At cow_file_range(), in the first iteration of the while loop we call
btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum
allocation size of 4KiB (fs_info->sectorsize). Due to free space
fragmentation, btrfs_reserve_extent() ends up allocating two extents
of 32KiB each, each one on a different iteration of that while loop;
6) Writeback of the data relocation inode completes;
7) Relocation proceeds and ends up at relocation.c:replace_file_extents(),
with a leaf which has a file extent item that points to the data extent
from block group X, that has a logical address (bytenr) of X + 128KiB
and a size of 64KiB. Then it calls get_new_location(), which does a
lookup in the data relocation tree for a file extent item starting at
offset 128KiB (X + 128KiB - X) and belonging to the data relocation
inode. It finds a corresponding file extent item, however that item
points to an extent that has a size of 32KiB, which doesn't match the
expected size of 64KiB, resuling in -EINVAL being returned from this
function and propagated up to __btrfs_cow_block(), which aborts the
current transaction.
To fix this make sure that at cow_file_range() when we call the allocator
we pass it a minimum allocation size corresponding the desired extent size
if the inode belongs to the data relocation tree, otherwise pass it the
filesystem's sector size as the minimum allocation size.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:32:55 +01:00
|
|
|
u64 min_alloc_size;
|
2016-06-22 18:54:23 -04:00
|
|
|
u64 blocksize = fs_info->sectorsize;
|
2008-11-06 22:02:51 -05:00
|
|
|
struct btrfs_key ins;
|
|
|
|
|
struct extent_map *em;
|
2017-03-06 23:04:20 +00:00
|
|
|
unsigned clear_bits;
|
|
|
|
|
unsigned long page_ops;
|
|
|
|
|
bool extent_reserved = false;
|
2008-11-06 22:02:51 -05:00
|
|
|
int ret = 0;
|
|
|
|
|
|
2020-06-03 08:55:14 +03:00
|
|
|
if (btrfs_is_free_space_inode(inode)) {
|
2014-02-07 12:21:23 -05:00
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out_unlock;
|
2013-10-25 16:19:08 -04:00
|
|
|
}
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2013-02-26 08:10:22 +00:00
|
|
|
num_bytes = ALIGN(end - start + 1, blocksize);
|
2008-11-06 22:02:51 -05:00
|
|
|
num_bytes = max(blocksize, num_bytes);
|
2018-02-15 18:07:59 +08:00
|
|
|
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2020-06-03 08:55:14 +03:00
|
|
|
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
|
2011-05-24 15:35:30 -04:00
|
|
|
|
btrfs: subpage: disable inline extent creation
[BUG]
When running the following fsx command (extracted from generic/127) on
subpage filesystem, it can create inline extent with regular extents:
fsx -q -l 262144 -o 65536 -S 191110531 -N 9057 -R -W $mnt/file > /tmp/fsx
The offending extent would look like:
item 9 key (257 INODE_REF 256) itemoff 15703 itemsize 14
index 2 namelen 4 name: file
item 10 key (257 EXTENT_DATA 0) itemoff 14975 itemsize 728
generation 7 type 0 (inline)
inline extent data size 707 ram_bytes 707 compression 0 (none)
item 11 key (257 EXTENT_DATA 4096) itemoff 14922 itemsize 53
generation 7 type 2 (prealloc)
prealloc data disk byte 102346752 nr 4096
prealloc data offset 0 nr 4096
[CAUSE]
For subpage filesystem, the writeback is triggered in page units, which
means, even if we just want to writeback range [16K, 20K) for 64K page
system, we will still try to writeback any dirty sector of range [0, 64K).
This is never a problem if sectorsize == PAGE_SIZE, but for subpage,
this can cause unexpected problems.
For above test case, the last several operations from fsx are:
9055 trunc from 0x40000 to 0x2c3
9057 falloc from 0x164c to 0x19d2 (0x386 bytes)
In operation 9055, we dirtied sector [0, 4096), then in falloc, we call
btrfs_wait_ordered_range(inode, start=4096, len=4096), only expecting to
writeback any dirty data in [4096, 8192), but nothing else.
Unfortunately, in subpage case, above btrfs_wait_ordered_range() will
trigger writeback of the range [0, 64K), which includes the data at
[0, 4096).
And since at the call site, we haven't yet increased i_size, which is
still 707, this means cow_file_range() can insert an inline extent.
Resulting above inline + regular extent.
[WORKAROUND]
I don't really have any good short-term solution yet, as this means all
operations that would trigger writeback need to be reviewed for any
i_size change.
So here I choose to disable inline extent creation for subpage case as a
workaround. We have done tons of work just to avoid such extent, so I
don't to create an exception just for subpage.
This only affects inline extent creation, subpage has no problem reading
existing inline extents at all.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-26 14:34:59 +08:00
|
|
|
/*
|
|
|
|
|
* Due to the page size limit, for subpage we can only trigger the
|
|
|
|
|
* writeback for the dirty sectors of page, that means data writeback
|
|
|
|
|
* is doing more writeback than what we want.
|
|
|
|
|
*
|
|
|
|
|
* This is especially unexpected for some call sites like fallocate,
|
|
|
|
|
* where we only increase i_size after everything is done.
|
|
|
|
|
* This means we can trigger inline extent even if we didn't want to.
|
|
|
|
|
* So here we skip inline extent creation completely.
|
|
|
|
|
*/
|
|
|
|
|
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
|
2021-11-16 14:03:45 -08:00
|
|
|
u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
|
|
|
|
|
end + 1);
|
|
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
/* lets try to make an inline extent */
|
2021-11-16 14:03:45 -08:00
|
|
|
ret = cow_file_range_inline(inode, actual_end, 0,
|
2019-11-07 15:19:16 -08:00
|
|
|
BTRFS_COMPRESS_NONE, NULL, false);
|
2008-11-06 22:02:51 -05:00
|
|
|
if (ret == 0) {
|
2017-10-19 14:15:55 -04:00
|
|
|
/*
|
|
|
|
|
* We use DO_ACCOUNTING here because we need the
|
|
|
|
|
* delalloc_release_metadata to be run _after_ we drop
|
|
|
|
|
* our outstanding extent for clearing delalloc for this
|
|
|
|
|
* range.
|
|
|
|
|
*/
|
btrfs: prevent extent_clear_unlock_delalloc() to unlock page not locked by __process_pages_contig()
In cow_file_range(), after we have succeeded creating an inline extent,
we unlock the page with extent_clear_unlock_delalloc() by passing
locked_page == NULL.
For sectorsize == PAGE_SIZE case, this is just making the page lock and
unlock harder to grab.
But for incoming subpage case, it can be a big problem.
For incoming subpage case, page locking have two entry points:
- __process_pages_contig()
In that case, we know exactly the range we want to lock (which only
requires sector alignment).
To handle the subpage requirement, we introduce btrfs_subpage::writers
to page::private, and will update it in __process_pages_contig().
- Other directly lock/unlock_page() call sites
Those won't touch btrfs_subpage::writers at all.
This means, page locked by __process_pages_contig() can only be unlocked
by __process_pages_contig().
Thankfully we already have the existing infrastructure in the form of
@locked_page in various call sites.
Unfortunately, extent_clear_unlock_delalloc() in cow_file_range() after
creating an inline extent is the exception.
It intentionally call extent_clear_unlock_delalloc() with locked_page ==
NULL, to also unlock current page (and clear its dirty/writeback bits).
To co-operate with incoming subpage modifications, and make the page
lock/unlock pair easier to understand, this patch will still call
extent_clear_unlock_delalloc() with locked_page, and only unlock the
page in __extent_writepage().
Tested-by: Ritesh Harjani <riteshh@linux.ibm.com> # [ppc64]
Tested-by: Anand Jain <anand.jain@oracle.com> # [aarch64]
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-05-31 16:50:48 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end,
|
|
|
|
|
locked_page,
|
2013-07-29 11:20:47 -04:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2017-10-19 14:15:55 -04:00
|
|
|
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
|
|
|
|
|
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
|
2021-01-26 16:33:45 +08:00
|
|
|
PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
|
2008-11-06 22:02:51 -05:00
|
|
|
*nr_written = *nr_written +
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
(end - start + PAGE_SIZE) / PAGE_SIZE;
|
2008-11-06 22:02:51 -05:00
|
|
|
*page_started = 1;
|
btrfs: prevent extent_clear_unlock_delalloc() to unlock page not locked by __process_pages_contig()
In cow_file_range(), after we have succeeded creating an inline extent,
we unlock the page with extent_clear_unlock_delalloc() by passing
locked_page == NULL.
For sectorsize == PAGE_SIZE case, this is just making the page lock and
unlock harder to grab.
But for incoming subpage case, it can be a big problem.
For incoming subpage case, page locking have two entry points:
- __process_pages_contig()
In that case, we know exactly the range we want to lock (which only
requires sector alignment).
To handle the subpage requirement, we introduce btrfs_subpage::writers
to page::private, and will update it in __process_pages_contig().
- Other directly lock/unlock_page() call sites
Those won't touch btrfs_subpage::writers at all.
This means, page locked by __process_pages_contig() can only be unlocked
by __process_pages_contig().
Thankfully we already have the existing infrastructure in the form of
@locked_page in various call sites.
Unfortunately, extent_clear_unlock_delalloc() in cow_file_range() after
creating an inline extent is the exception.
It intentionally call extent_clear_unlock_delalloc() with locked_page ==
NULL, to also unlock current page (and clear its dirty/writeback bits).
To co-operate with incoming subpage modifications, and make the page
lock/unlock pair easier to understand, this patch will still call
extent_clear_unlock_delalloc() with locked_page, and only unlock the
page in __extent_writepage().
Tested-by: Ritesh Harjani <riteshh@linux.ibm.com> # [ppc64]
Tested-by: Anand Jain <anand.jain@oracle.com> # [aarch64]
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-05-31 16:50:48 +08:00
|
|
|
/*
|
|
|
|
|
* locked_page is locked by the caller of
|
|
|
|
|
* writepage_delalloc(), not locked by
|
|
|
|
|
* __process_pages_contig().
|
|
|
|
|
*
|
|
|
|
|
* We can't let __process_pages_contig() to unlock it,
|
|
|
|
|
* as it doesn't have any subpage::writers recorded.
|
|
|
|
|
*
|
|
|
|
|
* Here we manually unlock the page, since the caller
|
|
|
|
|
* can't use page_started to determine if it's an
|
|
|
|
|
* inline extent or a compressed extent.
|
|
|
|
|
*/
|
|
|
|
|
unlock_page(locked_page);
|
2008-11-06 22:02:51 -05:00
|
|
|
goto out;
|
2012-03-12 16:03:00 +01:00
|
|
|
} else if (ret < 0) {
|
|
|
|
|
goto out_unlock;
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:14 +03:00
|
|
|
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
|
|
|
|
|
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
btrfs: fix data block group relocation failure due to concurrent scrub
When running relocation of a data block group while scrub is running in
parallel, it is possible that the relocation will fail and abort the
current transaction with an -EINVAL error:
[134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents
[134243.999871] ------------[ cut here ]------------
[134244.000741] BTRFS: Transaction aborted (error -22)
[134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
[134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.017151] Code: 48 c7 c7 (...)
[134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286
[134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000
[134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001
[134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001
[134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08
[134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000
[134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000
[134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0
[134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134244.034484] Call Trace:
[134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs]
[134244.035859] do_relocation+0x30b/0x790 [btrfs]
[134244.036681] ? do_raw_spin_unlock+0x49/0xc0
[134244.037460] ? _raw_spin_unlock+0x29/0x40
[134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs]
[134244.039245] relocate_block_group+0x388/0x770 [btrfs]
[134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs]
[134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs]
[134244.041345] btrfs_balance+0xc06/0x1860 [btrfs]
[134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs]
[134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs]
[134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs]
[134244.049043] ? do_raw_spin_unlock+0x49/0xc0
[134244.049838] ? _raw_spin_unlock+0x29/0x40
[134244.050587] ? __handle_mm_fault+0x11b3/0x14b0
[134244.051417] ? ksys_ioctl+0x92/0xb0
[134244.052070] ksys_ioctl+0x92/0xb0
[134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c
[134244.053511] __x64_sys_ioctl+0x16/0x20
[134244.054206] do_syscall_64+0x5c/0x280
[134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[134244.055819] RIP: 0033:0x7f29b51c9dd7
[134244.056491] Code: 00 00 00 (...)
[134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7
[134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003
[134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000
[134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a
[134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0
[134244.067626] irq event stamp: 0
[134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.070909] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134244.073432] ---[ end trace bd7c03622e0b0a99 ]---
The -EINVAL error comes from the following chain of function calls:
__btrfs_cow_block() <-- aborts the transaction
btrfs_reloc_cow_block()
replace_file_extents()
get_new_location() <-- returns -EINVAL
When relocating a data block group, for each allocated extent of the block
group, we preallocate another extent (at prealloc_file_extent_cluster()),
associated with the data relocation inode, and then dirty all its pages.
These preallocated extents have, and must have, the same size that extents
from the data block group being relocated have.
Later before we start the relocation stage that updates pointers (bytenr
field of file extent items) to point to the new extents, we trigger
writeback for the data relocation inode. The expectation is that writeback
will write the pages to the previously preallocated extents, that it
follows the NOCOW path. That is generally the case, however, if a scrub
is running it may have turned the block group that contains those extents
into RO mode, in which case writeback falls back to the COW path.
However in the COW path instead of allocating exactly one extent with the
expected size, the allocator may end up allocating several smaller extents
due to free space fragmentation - because we tell it at cow_file_range()
that the minimum allocation size can match the filesystem's sector size.
This later breaks the relocation's expectation that an extent associated
to a file extent item in the data relocation inode has the same size as
the respective extent pointed by a file extent item in another tree - in
this case the extent to which the relocation inode points is smaller,
causing relocation.c:get_new_location() to return -EINVAL.
For example, if we are relocating a data block group X that has a logical
address of X and the block group has an extent allocated at the logical
address X + 128KiB with a size of 64KiB:
1) At prealloc_file_extent_cluster() we allocate an extent for the data
relocation inode with a size of 64KiB and associate it to the file
offset 128KiB (X + 128KiB - X) of the data relocation inode. This
preallocated extent was allocated at block group Z;
2) A scrub running in parallel turns block group Z into RO mode and
starts scrubbing its extents;
3) Relocation triggers writeback for the data relocation inode;
4) When running delalloc (btrfs_run_delalloc_range()), we try first the
NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC
set in its flags. However, because block group Z is in RO mode, the
NOCOW path (run_delalloc_nocow()) falls back into the COW path, by
calling cow_file_range();
5) At cow_file_range(), in the first iteration of the while loop we call
btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum
allocation size of 4KiB (fs_info->sectorsize). Due to free space
fragmentation, btrfs_reserve_extent() ends up allocating two extents
of 32KiB each, each one on a different iteration of that while loop;
6) Writeback of the data relocation inode completes;
7) Relocation proceeds and ends up at relocation.c:replace_file_extents(),
with a leaf which has a file extent item that points to the data extent
from block group X, that has a logical address (bytenr) of X + 128KiB
and a size of 64KiB. Then it calls get_new_location(), which does a
lookup in the data relocation tree for a file extent item starting at
offset 128KiB (X + 128KiB - X) and belonging to the data relocation
inode. It finds a corresponding file extent item, however that item
points to an extent that has a size of 32KiB, which doesn't match the
expected size of 64KiB, resulting in -EINVAL being returned from this
function and propagated up to __btrfs_cow_block(), which aborts the
current transaction.
To fix this make sure that at cow_file_range() when we call the allocator
we pass it a minimum allocation size corresponding to the desired extent size
if the inode belongs to the data relocation tree, otherwise pass it the
filesystem's sector size as the minimum allocation size.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:32:55 +01:00
|
|
|
/*
|
|
|
|
|
* Relocation relies on the relocated extents to have exactly the same
|
|
|
|
|
* size as the original extents. Normally writeback for relocation data
|
|
|
|
|
* extents follows a NOCOW path because relocation preallocates the
|
|
|
|
|
* extents. However, due to an operation such as scrub turning a block
|
|
|
|
|
* group to RO mode, it may fallback to COW mode, so we must make sure
|
|
|
|
|
* an extent allocated during COW has exactly the requested size and can
|
|
|
|
|
* not be split into smaller extents, otherwise relocation breaks and
|
|
|
|
|
* fails during the stage where it updates the bytenr of file extent
|
|
|
|
|
* items.
|
|
|
|
|
*/
|
2021-09-09 01:19:25 +09:00
|
|
|
if (btrfs_is_data_reloc_root(root))
|
btrfs: fix data block group relocation failure due to concurrent scrub
When running relocation of a data block group while scrub is running in
parallel, it is possible that the relocation will fail and abort the
current transaction with an -EINVAL error:
[134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents
[134243.999871] ------------[ cut here ]------------
[134244.000741] BTRFS: Transaction aborted (error -22)
[134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
[134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.017151] Code: 48 c7 c7 (...)
[134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286
[134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000
[134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001
[134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001
[134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08
[134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000
[134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000
[134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0
[134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134244.034484] Call Trace:
[134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs]
[134244.035859] do_relocation+0x30b/0x790 [btrfs]
[134244.036681] ? do_raw_spin_unlock+0x49/0xc0
[134244.037460] ? _raw_spin_unlock+0x29/0x40
[134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs]
[134244.039245] relocate_block_group+0x388/0x770 [btrfs]
[134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs]
[134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs]
[134244.041345] btrfs_balance+0xc06/0x1860 [btrfs]
[134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs]
[134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs]
[134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs]
[134244.049043] ? do_raw_spin_unlock+0x49/0xc0
[134244.049838] ? _raw_spin_unlock+0x29/0x40
[134244.050587] ? __handle_mm_fault+0x11b3/0x14b0
[134244.051417] ? ksys_ioctl+0x92/0xb0
[134244.052070] ksys_ioctl+0x92/0xb0
[134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c
[134244.053511] __x64_sys_ioctl+0x16/0x20
[134244.054206] do_syscall_64+0x5c/0x280
[134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[134244.055819] RIP: 0033:0x7f29b51c9dd7
[134244.056491] Code: 00 00 00 (...)
[134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7
[134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003
[134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000
[134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a
[134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0
[134244.067626] irq event stamp: 0
[134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.070909] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134244.073432] ---[ end trace bd7c03622e0b0a99 ]---
The -EINVAL error comes from the following chain of function calls:
__btrfs_cow_block() <-- aborts the transaction
btrfs_reloc_cow_block()
replace_file_extents()
get_new_location() <-- returns -EINVAL
When relocating a data block group, for each allocated extent of the block
group, we preallocate another extent (at prealloc_file_extent_cluster()),
associated with the data relocation inode, and then dirty all its pages.
These preallocated extents have, and must have, the same size that extents
from the data block group being relocated have.
Later before we start the relocation stage that updates pointers (bytenr
field of file extent items) to point to the new extents, we trigger
writeback for the data relocation inode. The expectation is that writeback
will write the pages to the previously preallocated extents, that it
follows the NOCOW path. That is generally the case, however, if a scrub
is running it may have turned the block group that contains those extents
into RO mode, in which case writeback falls back to the COW path.
However in the COW path instead of allocating exactly one extent with the
expected size, the allocator may end up allocating several smaller extents
due to free space fragmentation - because we tell it at cow_file_range()
that the minimum allocation size can match the filesystem's sector size.
This later breaks the relocation's expectation that an extent associated
to a file extent item in the data relocation inode has the same size as
the respective extent pointed by a file extent item in another tree - in
this case the extent to which the relocation inode points is smaller,
causing relocation.c:get_new_location() to return -EINVAL.
For example, if we are relocating a data block group X that has a logical
address of X and the block group has an extent allocated at the logical
address X + 128KiB with a size of 64KiB:
1) At prealloc_file_extent_cluster() we allocate an extent for the data
relocation inode with a size of 64KiB and associate it to the file
offset 128KiB (X + 128KiB - X) of the data relocation inode. This
preallocated extent was allocated at block group Z;
2) A scrub running in parallel turns block group Z into RO mode and
starts scrubbing its extents;
3) Relocation triggers writeback for the data relocation inode;
4) When running delalloc (btrfs_run_delalloc_range()), we try first the
NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC
set in its flags. However, because block group Z is in RO mode, the
NOCOW path (run_delalloc_nocow()) falls back into the COW path, by
calling cow_file_range();
5) At cow_file_range(), in the first iteration of the while loop we call
btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum
allocation size of 4KiB (fs_info->sectorsize). Due to free space
fragmentation, btrfs_reserve_extent() ends up allocating two extents
of 32KiB each, each one on a different iteration of that while loop;
6) Writeback of the data relocation inode completes;
7) Relocation proceeds and ends up at relocation.c:replace_file_extents(),
with a leaf which has a file extent item that points to the data extent
from block group X, that has a logical address (bytenr) of X + 128KiB
and a size of 64KiB. Then it calls get_new_location(), which does a
lookup in the data relocation tree for a file extent item starting at
offset 128KiB (X + 128KiB - X) and belonging to the data relocation
inode. It finds a corresponding file extent item, however that item
points to an extent that has a size of 32KiB, which doesn't match the
expected size of 64KiB, resulting in -EINVAL being returned from this
function and propagated up to __btrfs_cow_block(), which aborts the
current transaction.
To fix this make sure that at cow_file_range() when we call the allocator
we pass it a minimum allocation size corresponding to the desired extent size
if the inode belongs to the data relocation tree, otherwise pass it the
filesystem's sector size as the minimum allocation size.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:32:55 +01:00
|
|
|
min_alloc_size = num_bytes;
|
|
|
|
|
else
|
|
|
|
|
min_alloc_size = fs_info->sectorsize;
|
|
|
|
|
|
2018-02-15 12:29:38 +08:00
|
|
|
while (num_bytes > 0) {
|
|
|
|
|
cur_alloc_size = num_bytes;
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
path, btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
|
btrfs: fix data block group relocation failure due to concurrent scrub
When running relocation of a data block group while scrub is running in
parallel, it is possible that the relocation will fail and abort the
current transaction with an -EINVAL error:
[134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents
[134243.999871] ------------[ cut here ]------------
[134244.000741] BTRFS: Transaction aborted (error -22)
[134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
[134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs]
[134244.017151] Code: 48 c7 c7 (...)
[134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286
[134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000
[134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001
[134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001
[134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08
[134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000
[134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000
[134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0
[134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134244.034484] Call Trace:
[134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs]
[134244.035859] do_relocation+0x30b/0x790 [btrfs]
[134244.036681] ? do_raw_spin_unlock+0x49/0xc0
[134244.037460] ? _raw_spin_unlock+0x29/0x40
[134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs]
[134244.039245] relocate_block_group+0x388/0x770 [btrfs]
[134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs]
[134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs]
[134244.041345] btrfs_balance+0xc06/0x1860 [btrfs]
[134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs]
[134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs]
[134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs]
[134244.049043] ? do_raw_spin_unlock+0x49/0xc0
[134244.049838] ? _raw_spin_unlock+0x29/0x40
[134244.050587] ? __handle_mm_fault+0x11b3/0x14b0
[134244.051417] ? ksys_ioctl+0x92/0xb0
[134244.052070] ksys_ioctl+0x92/0xb0
[134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c
[134244.053511] __x64_sys_ioctl+0x16/0x20
[134244.054206] do_syscall_64+0x5c/0x280
[134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[134244.055819] RIP: 0033:0x7f29b51c9dd7
[134244.056491] Code: 00 00 00 (...)
[134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7
[134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003
[134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000
[134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a
[134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0
[134244.067626] irq event stamp: 0
[134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.070909] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134244.073432] ---[ end trace bd7c03622e0b0a99 ]---
The -EINVAL error comes from the following chain of function calls:
__btrfs_cow_block() <-- aborts the transaction
btrfs_reloc_cow_block()
replace_file_extents()
get_new_location() <-- returns -EINVAL
When relocating a data block group, for each allocated extent of the block
group, we preallocate another extent (at prealloc_file_extent_cluster()),
associated with the data relocation inode, and then dirty all its pages.
These preallocated extents have, and must have, the same size that extents
from the data block group being relocated have.
Later before we start the relocation stage that updates pointers (bytenr
field of file extent items) to point to the new extents, we trigger
writeback for the data relocation inode. The expectation is that writeback
will write the pages to the previously preallocated extents, that it
follows the NOCOW path. That is generally the case, however, if a scrub
is running it may have turned the block group that contains those extents
into RO mode, in which case writeback falls back to the COW path.
However in the COW path instead of allocating exactly one extent with the
expected size, the allocator may end up allocating several smaller extents
due to free space fragmentation - because we tell it at cow_file_range()
that the minimum allocation size can match the filesystem's sector size.
This later breaks the relocation's expectation that an extent associated
to a file extent item in the data relocation inode has the same size as
the respective extent pointed by a file extent item in another tree - in
this case the extent to which the relocation inode points is smaller,
causing relocation.c:get_new_location() to return -EINVAL.
For example, if we are relocating a data block group X that has a logical
address of X and the block group has an extent allocated at the logical
address X + 128KiB with a size of 64KiB:
1) At prealloc_file_extent_cluster() we allocate an extent for the data
relocation inode with a size of 64KiB and associate it to the file
offset 128KiB (X + 128KiB - X) of the data relocation inode. This
preallocated extent was allocated at block group Z;
2) A scrub running in parallel turns block group Z into RO mode and
starts scrubbing its extents;
3) Relocation triggers writeback for the data relocation inode;
4) When running delalloc (btrfs_run_delalloc_range()), we try first the
NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC
set in its flags. However, because block group Z is in RO mode, the
NOCOW path (run_delalloc_nocow()) falls back into the COW path, by
calling cow_file_range();
5) At cow_file_range(), in the first iteration of the while loop we call
btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum
allocation size of 4KiB (fs_info->sectorsize). Due to free space
fragmentation, btrfs_reserve_extent() ends up allocating two extents
of 32KiB each, each one on a different iteration of that while loop;
6) Writeback of the data relocation inode completes;
7) Relocation proceeds and ends up at relocation.c:replace_file_extents(),
with a leaf which has a file extent item that points to the data extent
from block group X, that has a logical address (bytenr) of X + 128KiB
and a size of 64KiB. Then it calls get_new_location(), which does a
lookup in the data relocation tree for a file extent item starting at
offset 128KiB (X + 128KiB - X) and belonging to the data relocation
inode. It finds a corresponding file extent item, however that item
points to an extent that has a size of 32KiB, which doesn't match the
expected size of 64KiB, resulting in -EINVAL being returned from this
function and propagated up to __btrfs_cow_block(), which aborts the
current transaction.
To fix this make sure that at cow_file_range() when we call the allocator
we pass it a minimum allocation size corresponding to the desired extent size
if the inode belongs to the data relocation tree, otherwise pass it the
filesystem's sector size as the minimum allocation size.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:32:55 +01:00
|
|
|
min_alloc_size, 0, alloc_hint,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree or the
free space cache. When we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variable to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
&ins, 1, 1);
|
2013-08-14 14:02:47 -04:00
|
|
|
if (ret < 0)
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out_unlock;
|
2017-03-06 23:04:20 +00:00
|
|
|
cur_alloc_size = ins.offset;
|
|
|
|
|
extent_reserved = true;
|
2009-01-05 21:25:51 -05:00
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
ram_size = ins.offset;
|
2020-06-03 08:55:14 +03:00
|
|
|
em = create_io_em(inode, start, ins.offset, /* len */
|
2017-01-31 07:50:22 -08:00
|
|
|
start, /* orig_start */
|
|
|
|
|
ins.objectid, /* block_start */
|
|
|
|
|
ins.offset, /* block_len */
|
|
|
|
|
ins.offset, /* orig_block_len */
|
|
|
|
|
ram_size, /* ram_bytes */
|
|
|
|
|
BTRFS_COMPRESS_NONE, /* compress_type */
|
2017-02-13 15:35:09 -08:00
|
|
|
BTRFS_ORDERED_REGULAR /* type */);
|
2018-05-30 16:48:56 +08:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
2013-04-22 10:53:47 +00:00
|
|
|
goto out_reserve;
|
2018-05-30 16:48:56 +08:00
|
|
|
}
|
2017-01-31 07:50:22 -08:00
|
|
|
free_extent_map(em);
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2019-11-06 12:11:56 -08:00
|
|
|
ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
|
|
|
|
|
ins.objectid, cur_alloc_size, 0,
|
|
|
|
|
1 << BTRFS_ORDERED_REGULAR,
|
|
|
|
|
BTRFS_COMPRESS_NONE);
|
2013-04-22 10:53:47 +00:00
|
|
|
if (ret)
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 10:43:00 +01:00
|
|
|
goto out_drop_extent_cache;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2021-09-09 01:19:25 +09:00
|
|
|
if (btrfs_is_data_reloc_root(root)) {
|
2020-06-03 08:55:14 +03:00
|
|
|
ret = btrfs_reloc_clone_csums(inode, start,
|
2008-12-12 10:03:38 -05:00
|
|
|
cur_alloc_size);
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 10:25:51 +08:00
|
|
|
/*
|
|
|
|
|
* Only drop cache here, and process as normal.
|
|
|
|
|
*
|
|
|
|
|
* We must not allow extent_clear_unlock_delalloc()
|
|
|
|
|
* at out_unlock label to free meta of this ordered
|
|
|
|
|
* extent, as its meta should be freed by
|
|
|
|
|
* btrfs_finish_ordered_io().
|
|
|
|
|
*
|
|
|
|
|
* So we must continue until @start is increased to
|
|
|
|
|
* skip current ordered extent.
|
|
|
|
|
*/
|
2013-08-14 14:02:47 -04:00
|
|
|
if (ret)
|
2020-06-03 08:55:14 +03:00
|
|
|
btrfs_drop_extent_cache(inode, start,
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 10:25:51 +08:00
|
|
|
start + ram_size - 1, 0);
|
2008-12-12 10:03:38 -05:00
|
|
|
}
|
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no delalloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now an ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 15:39:32 +01:00
|
|
|
|
2021-04-07 19:22:13 +08:00
|
|
|
/*
|
|
|
|
|
* We're not doing compressed IO, don't unlock the first page
|
|
|
|
|
* (which the caller expects to stay locked), don't clear any
|
|
|
|
|
* dirty bits and don't set any writeback bits
|
2009-09-02 16:53:46 -04:00
|
|
|
*
|
2021-04-07 19:22:13 +08:00
|
|
|
* Do set the Ordered (Private2) bit so we know this page was
|
|
|
|
|
* properly setup for writepage.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
*/
|
2017-03-06 23:04:20 +00:00
|
|
|
page_ops = unlock ? PAGE_UNLOCK : 0;
|
2021-04-07 19:22:13 +08:00
|
|
|
page_ops |= PAGE_SET_ORDERED;
|
2009-10-08 11:27:10 -04:00
|
|
|
|
2020-06-03 08:55:14 +03:00
|
|
|
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
|
2019-07-17 16:18:16 +03:00
|
|
|
locked_page,
|
2013-07-29 11:20:47 -04:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC,
|
2017-03-06 23:04:20 +00:00
|
|
|
page_ops);
|
2018-02-15 12:29:38 +08:00
|
|
|
if (num_bytes < cur_alloc_size)
|
|
|
|
|
num_bytes = 0;
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 10:25:51 +08:00
|
|
|
else
|
2018-02-15 12:29:38 +08:00
|
|
|
num_bytes -= cur_alloc_size;
|
2007-12-17 20:14:04 -05:00
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
|
start += cur_alloc_size;
|
2017-03-06 23:04:20 +00:00
|
|
|
extent_reserved = false;
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 10:25:51 +08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* btrfs_reloc_clone_csums() error, since start is increased
|
|
|
|
|
* extent_clear_unlock_delalloc() at out_unlock label won't
|
|
|
|
|
* free metadata of current ordered extent, we're OK to exit.
|
|
|
|
|
*/
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_unlock;
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
2012-03-12 16:03:00 +01:00
|
|
|
out:
|
2007-12-17 20:14:01 -05:00
|
|
|
return ret;
|
2012-11-01 07:32:18 +00:00
|
|
|
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096" foo
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 10:43:00 +01:00
|
|
|
out_drop_extent_cache:
|
2020-06-03 08:55:14 +03:00
|
|
|
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
|
2013-04-22 10:53:47 +00:00
|
|
|
out_reserve:
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
|
2012-03-12 16:03:00 +01:00
|
|
|
out_unlock:
|
2022-07-09 08:18:49 +09:00
|
|
|
/*
|
|
|
|
|
* If done_offset is non-NULL and ret == -EAGAIN, we expect the
|
|
|
|
|
* caller to write out the successfully allocated region and retry.
|
|
|
|
|
*/
|
|
|
|
|
if (done_offset && ret == -EAGAIN) {
|
|
|
|
|
if (orig_start < start)
|
|
|
|
|
*done_offset = start - 1;
|
|
|
|
|
else
|
|
|
|
|
*done_offset = start;
|
|
|
|
|
return ret;
|
|
|
|
|
} else if (ret == -EAGAIN) {
|
|
|
|
|
/* Convert to -ENOSPC since the caller cannot retry. */
|
|
|
|
|
ret = -ENOSPC;
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
/*
|
|
|
|
|
* Now, we have three regions to clean up:
|
|
|
|
|
*
|
|
|
|
|
* |-------(1)----|---(2)---|-------------(3)----------|
|
|
|
|
|
* `- orig_start `- start `- start + cur_alloc_size `- end
|
|
|
|
|
*
|
|
|
|
|
* We process each region below.
|
|
|
|
|
*/
|
|
|
|
|
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
|
|
|
|
|
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
|
2021-01-26 16:33:45 +08:00
|
|
|
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
|
2017-03-06 23:04:20 +00:00
|
|
|
/*
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
* For the range (1). We have already instantiated the ordered extents
|
|
|
|
|
* for this region. They are cleaned up by
|
|
|
|
|
* btrfs_cleanup_ordered_extents() in e.g,
|
|
|
|
|
* btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
|
|
|
|
|
* already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
|
|
|
|
|
* EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
|
|
|
|
|
* function.
|
|
|
|
|
*
|
|
|
|
|
* However, in case of unlock == 0, we still need to unlock the pages
|
|
|
|
|
* (except @locked_page) to ensure all the pages are unlocked.
|
|
|
|
|
*/
|
2022-06-21 15:41:01 +09:00
|
|
|
if (!unlock && orig_start < start) {
|
|
|
|
|
if (!locked_page)
|
|
|
|
|
mapping_set_error(inode->vfs_inode.i_mapping, ret);
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
|
|
|
|
|
locked_page, 0, page_ops);
|
2022-06-21 15:41:01 +09:00
|
|
|
}
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
|
2017-03-06 23:04:20 +00:00
|
|
|
/*
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
* For the range (2). If we reserved an extent for our delalloc range
|
|
|
|
|
* (or a subrange) and failed to create the respective ordered extent,
|
|
|
|
|
* then it means that when we reserved the extent we decremented the
|
|
|
|
|
* extent's size from the data space_info's bytes_may_use counter and
|
|
|
|
|
* incremented the space_info's bytes_reserved counter by the same
|
|
|
|
|
* amount. We must make sure extent_clear_unlock_delalloc() does not try
|
|
|
|
|
* to decrement again the data space_info's bytes_may_use counter,
|
|
|
|
|
* therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
|
2017-03-06 23:04:20 +00:00
|
|
|
*/
|
|
|
|
|
if (extent_reserved) {
|
2020-06-03 08:55:14 +03:00
|
|
|
extent_clear_unlock_delalloc(inode, start,
|
2020-05-27 11:15:53 +01:00
|
|
|
start + cur_alloc_size - 1,
|
2017-03-06 23:04:20 +00:00
|
|
|
locked_page,
|
|
|
|
|
clear_bits,
|
|
|
|
|
page_ops);
|
|
|
|
|
start += cur_alloc_size;
|
|
|
|
|
if (start >= end)
|
2022-06-21 15:41:02 +09:00
|
|
|
return ret;
|
2017-03-06 23:04:20 +00:00
|
|
|
}
|
btrfs: ensure pages are unlocked on cow_file_range() failure
There is a hung_task report on zoned btrfs like below.
https://github.com/naota/linux/issues/59
[726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
[726.329839] Not tainted 5.16.0-rc1+ #1
[726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
[726.331608] Call Trace:
[726.331611] <TASK>
[726.331614] __schedule+0x2e5/0x9d0
[726.331622] schedule+0x58/0xd0
[726.331626] io_schedule+0x3f/0x70
[726.331629] __folio_lock+0x125/0x200
[726.331634] ? find_get_entries+0x1bc/0x240
[726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
[726.331642] truncate_inode_pages_range+0x5b2/0x770
[726.331649] truncate_inode_pages_final+0x44/0x50
[726.331653] btrfs_evict_inode+0x67/0x480
[726.331658] evict+0xd0/0x180
[726.331661] iput+0x13f/0x200
[726.331664] do_unlinkat+0x1c0/0x2b0
[726.331668] __x64_sys_unlink+0x23/0x30
[726.331670] do_syscall_64+0x3b/0xc0
[726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
[726.331677] RIP: 0033:0x7fb9490a171b
[726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
[726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
[726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
[726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
[726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
[726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
[726.331693] </TASK>
While we debug the issue, we found running fstests generic/551 on 5GB
non-zoned null_blk device in the emulated zoned mode also had a
similar hung issue.
Also, we can reproduce the same symptom with an error injected
cow_file_range() setup.
The hang occurs when cow_file_range() fails in the middle of
allocation. cow_file_range() called from do_allocation_zoned() can
split the given region ([start, end]) for allocation depending on
current block group usages. When btrfs can allocate bytes for one part
of the split regions but fails for the other region (e.g. because of
-ENOSPC), we return the error leaving the pages in the succeeded regions
locked. Technically, this occurs only when @unlock == 0. Otherwise, we
unlock the pages in an allocated region after creating an ordered
extent.
Considering the callers of cow_file_range(unlock=0) won't write out
the pages, we can unlock the pages on error exit from
cow_file_range(). So, we can ensure all the pages except @locked_page
are unlocked on error case.
In summary, cow_file_range now behaves like this:
- page_started == 1 (return value)
- All the pages are unlocked. IO is started.
- unlock == 1
- All the pages except @locked_page are unlocked in any case
- unlock == 0
- On success, all the pages are locked for writing them out
- On failure, all the pages except @locked_page are unlocked
Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 15:40:59 +09:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* For the range (3). We never touched the region. In addition to the
|
|
|
|
|
* clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
|
|
|
|
|
* space_info's bytes_may_use counter, reserved in
|
|
|
|
|
* btrfs_check_data_free_space().
|
|
|
|
|
*/
|
2020-06-03 08:55:14 +03:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, locked_page,
|
2017-03-06 23:04:20 +00:00
|
|
|
clear_bits | EXTENT_CLEAR_DATA_RESV,
|
|
|
|
|
page_ops);
|
2022-06-21 15:41:02 +09:00
|
|
|
return ret;
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
/*
|
|
|
|
|
* work queue call back to start compression on a file and pages
*
* Runs compress_file_range() on the chunk. When that returns 0 the inode
* reference is handed to btrfs_add_delayed_iput() and ->inode is cleared, so
* the later callbacks (async_cow_submit, async_cow_free) skip the work they
* only do while ->inode is set.
|
|
|
|
|
*/
|
|
|
|
|
static noinline void async_cow_start(struct btrfs_work *work)
|
|
|
|
|
{
|
2019-03-12 17:20:25 +02:00
|
|
|
struct async_chunk *async_chunk;
|
2019-07-17 14:41:44 +03:00
|
|
|
int compressed_extents;
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-03-12 17:20:25 +02:00
|
|
|
/* Recover the chunk that embeds this work item. */
async_chunk = container_of(work, struct async_chunk, work);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-07-17 14:41:44 +03:00
|
|
|
compressed_extents = compress_file_range(async_chunk);
|
|
|
|
|
/* Zero result: release the inode reference now and mark the chunk inode-less. */
if (compressed_extents == 0) {
|
2019-03-12 17:20:25 +02:00
|
|
|
btrfs_add_delayed_iput(async_chunk->inode);
|
|
|
|
|
async_chunk->inode = NULL;
|
2012-06-08 15:16:12 -04:00
|
|
|
}
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* work queue call back to submit previously compressed pages
*
* Calls submit_compressed_extents() when ->inode is still set (async_cow_start
* clears it when there is nothing to submit). In every case the page count of
* the chunk is subtracted from fs_info->async_delalloc_pages, and waiters on
* fs_info->async_submit_wait are woken once the remaining count drops below
* 5 * SZ_1M.
|
|
|
|
|
*/
|
|
|
|
|
static noinline void async_cow_submit(struct btrfs_work *work)
|
|
|
|
|
{
|
2019-03-12 17:20:26 +02:00
|
|
|
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
|
|
|
|
|
work);
|
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
|
2008-11-06 22:02:51 -05:00
|
|
|
unsigned long nr_pages;
|
|
|
|
|
|
2019-03-12 17:20:25 +02:00
|
|
|
/* Number of pages in the inclusive byte range [start, end], rounded up. */
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
PAGE_SHIFT;
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-01-03 10:50:03 +02:00
|
|
|
/*
|
2019-03-12 17:20:25 +02:00
|
|
|
* ->inode could be NULL if async_chunk_start has failed to compress,
|
2019-01-03 10:50:03 +02:00
|
|
|
* in which case we don't have anything to submit, yet we need to
|
|
|
|
|
* always adjust ->async_delalloc_pages as its paired with the init
|
|
|
|
|
* happening in cow_file_range_async
|
|
|
|
|
*/
|
2019-03-12 17:20:25 +02:00
|
|
|
if (async_chunk->inode)
|
|
|
|
|
submit_compressed_extents(async_chunk);
|
2021-07-14 14:47:17 -04:00
|
|
|
|
|
|
|
|
|
/* atomic_sub_return implies a barrier */
|
|
|
|
|
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
|
|
|
|
|
5 * SZ_1M)
|
|
|
|
|
cond_wake_up_nomb(&fs_info->async_submit_wait);
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
2008-11-06 22:02:51 -05:00
|
|
|
/*
* Work queue callback that releases what an async_chunk still holds.
*
* Hands the inode reference to btrfs_add_delayed_iput() if ->inode is still
* set, puts the blkcg css reference if one was recorded, then decrements
* async_cow->num_chunks and frees the async_cow container when the counter
* reaches zero.
*/
static noinline void async_cow_free(struct btrfs_work *work)
|
|
|
|
|
{
|
2019-03-12 17:20:25 +02:00
|
|
|
struct async_chunk *async_chunk;
|
2021-09-27 15:21:45 +08:00
|
|
|
struct async_cow *async_cow;
|
2019-03-12 17:20:24 +02:00
|
|
|
|
2019-03-12 17:20:25 +02:00
|
|
|
async_chunk = container_of(work, struct async_chunk, work);
|
|
|
|
|
/* ->inode is NULL when async_cow_start already released the reference. */
if (async_chunk->inode)
|
|
|
|
|
btrfs_add_delayed_iput(async_chunk->inode);
|
2019-07-10 12:28:17 -07:00
|
|
|
|
|
if (async_chunk->blkcg_css)
|
|
|
|
|
css_put(async_chunk->blkcg_css);
|
2021-09-27 15:21:45 +08:00
|
|
|
|
|
|
|
|
|
/*
* The chunks live inside the async_cow allocation (see ctx->chunks in
* cow_file_range_async), so it is freed only by the last chunk.
*/
async_cow = async_chunk->async_cow;
|
|
|
|
|
if (atomic_dec_and_test(&async_cow->num_chunks))
|
|
|
|
|
kvfree(async_cow);
|
2008-11-06 22:02:51 -05:00
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:22 +03:00
|
|
|
/*
* Split the delalloc range [start, end] (end inclusive) into async_chunk work
* items and queue them on fs_info->delalloc_workers.
*
* When the inode has BTRFS_INODE_NOCOMPRESS set and FORCE_COMPRESS is not
* enabled, a single chunk covers the whole range; otherwise each chunk covers
* at most SZ_512K bytes. Only the first chunk carries @locked_page. Every
* queued chunk takes an inode reference with ihold(), adds its page count to
* fs_info->async_delalloc_pages and to *@nr_written, and records a blkcg css
* reference when the wbc css is not the root css.
*
* Returns 0 with *@page_started set to 1 once all chunks are queued. If the
* chunk array cannot be allocated, the range is handed to
* extent_clear_unlock_delalloc() and -ENOMEM is returned.
*
* NOTE(review): num_chunks and nr_pages are computed from (end - start) and
* (cur_end - start) with an inclusive end, i.e. one byte short of the length.
* This gives the same result for the ranges the loop itself builds, presumably
* relying on @start being page-aligned - confirm against callers.
*/
static int cow_file_range_async(struct btrfs_inode *inode,
|
2019-07-10 12:28:17 -07:00
|
|
|
struct writeback_control *wbc,
|
|
|
|
|
struct page *locked_page,
|
2008-11-06 22:02:51 -05:00
|
|
|
u64 start, u64 end, int *page_started,
|
2019-10-29 18:28:57 +01:00
|
|
|
unsigned long *nr_written)
|
2008-11-06 22:02:51 -05:00
|
|
|
{
|
2020-06-03 08:55:22 +03:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
2019-07-10 12:28:17 -07:00
|
|
|
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
|
2019-03-12 17:20:24 +02:00
|
|
|
struct async_cow *ctx;
|
|
|
|
|
struct async_chunk *async_chunk;
|
2008-11-06 22:02:51 -05:00
|
|
|
unsigned long nr_pages;
|
|
|
|
|
u64 cur_end;
|
2019-03-12 17:20:24 +02:00
|
|
|
u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
|
|
|
|
|
int i;
|
|
|
|
|
bool should_compress;
|
2019-04-01 11:29:57 +03:00
|
|
|
unsigned nofs_flag;
|
2022-07-14 11:07:16 -07:00
|
|
|
const blk_opf_t write_flags = wbc_to_write_flags(wbc);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2020-06-03 08:55:22 +03:00
|
|
|
unlock_extent(&inode->io_tree, start, end);
|
2019-03-12 17:20:24 +02:00
|
|
|
|
2020-06-03 08:55:22 +03:00
|
|
|
/* NOCOMPRESS inode without FORCE_COMPRESS: one chunk covers the whole range. */
if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
|
2019-03-12 17:20:24 +02:00
|
|
|
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
|
|
|
|
|
num_chunks = 1;
|
|
|
|
|
should_compress = false;
|
|
|
|
|
} else {
|
|
|
|
|
should_compress = true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-04-01 11:29:57 +03:00
|
|
|
/* The GFP_KERNEL allocation runs inside a memalloc_nofs_save()/restore() section. */
nofs_flag = memalloc_nofs_save();
|
|
|
|
|
ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
|
|
|
|
|
memalloc_nofs_restore(nofs_flag);
|
|
|
|
|
|
2019-03-12 17:20:24 +02:00
|
|
|
/* Allocation failed: pass the whole range to extent_clear_unlock_delalloc() and bail out. */
if (!ctx) {
|
|
|
|
|
unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
|
|
|
|
|
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
|
|
|
|
|
EXTENT_DO_ACCOUNTING;
|
2021-01-26 16:33:45 +08:00
|
|
|
unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
|
|
|
|
|
PAGE_END_WRITEBACK | PAGE_SET_ERROR;
|
2019-03-12 17:20:24 +02:00
|
|
|
|
2020-06-03 08:55:22 +03:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, locked_page,
|
|
|
|
|
clear_bits, page_ops);
|
2019-03-12 17:20:24 +02:00
|
|
|
return -ENOMEM;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
async_chunk = ctx->chunks;
|
|
|
|
|
/* async_cow_free() decrements this counter and frees ctx when it reaches zero. */
atomic_set(&ctx->num_chunks, num_chunks);
|
|
|
|
|
|
|
|
|
|
|
|
/* One work item per chunk; start advances to cur_end + 1 on every iteration. */
for (i = 0; i < num_chunks; i++) {
|
|
|
|
|
if (should_compress)
|
|
|
|
|
cur_end = min(end, start + SZ_512K - 1);
|
|
|
|
|
else
|
|
|
|
|
cur_end = end;
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-01-03 10:50:01 +02:00
|
|
|
/*
|
|
|
|
|
* igrab is called higher up in the call chain, take only the
|
|
|
|
|
* lightweight reference for the callback lifetime
|
|
|
|
|
*/
|
2020-06-03 08:55:22 +03:00
|
|
|
ihold(&inode->vfs_inode);
|
2021-09-27 15:21:45 +08:00
|
|
|
async_chunk[i].async_cow = ctx;
|
2020-06-03 08:55:22 +03:00
|
|
|
async_chunk[i].inode = &inode->vfs_inode;
|
2019-03-12 17:20:24 +02:00
|
|
|
async_chunk[i].start = start;
|
|
|
|
|
async_chunk[i].end = cur_end;
|
|
|
|
|
async_chunk[i].write_flags = write_flags;
|
|
|
|
|
INIT_LIST_HEAD(&async_chunk[i].extents);
|
|
|
|
|
|
Btrfs: only associate the locked page with one async_chunk struct
The btrfs writepages function collects a large range of pages flagged
for delayed allocation, and then sends them down through the COW code
for processing. When compression is on, we allocate one async_chunk
structure for every 512K, and then run those pages through the
compression code for IO submission.
writepages starts all of this off with a single page, locked by the
original call to extent_write_cache_pages(), and it's important to keep
track of this page because it has already been through
clear_page_dirty_for_io().
The btrfs async_chunk struct has a pointer to the locked_page, and when
we're redirtying the page because compression had to fallback to
uncompressed IO, we use page->index to decide if a given async_chunk
struct really owns that page.
But, this is racey. If a given delalloc range is broken up into two
async_chunks (chunkA and chunkB), we can end up with something like
this:
compress_file_range(chunkA)
submit_compress_extents(chunkA)
submit compressed bios(chunkA)
put_page(locked_page)
compress_file_range(chunkB)
...
Or:
async_cow_submit
submit_compressed_extents <--- falls back to buffered writeout
cow_file_range
extent_clear_unlock_delalloc
__process_pages_contig
put_page(locked_pages)
async_cow_submit
The end result is that chunkA is completed and cleaned up before chunkB
even starts processing. This means we can free locked_page() and reuse
it elsewhere. If we get really lucky, it'll have the same page->index
in its new home as it did before.
While we're processing chunkB, we might decide we need to fall back to
uncompressed IO, and so compress_file_range() will call
__set_page_dirty_nobufers() on chunkB->locked_page.
Without cgroups in use, this creates as a phantom dirty page, which
isn't great but isn't the end of the world. What can happen, it can go
through the fixup worker and the whole COW machinery again:
in submit_compressed_extents():
while (async extents) {
...
cow_file_range
if (!page_started ...)
extent_write_locked_range
else if (...)
unlock_page
continue;
This hasn't been observed in practice but is still possible.
With cgroups in use, we might crash in the accounting code because
page->mapping->i_wb isn't set.
BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0
IP: percpu_counter_add_batch+0x11/0x70
PGD 66534e067 P4D 66534e067 PUD 66534f067 PMD 0
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 16 PID: 2172 Comm: rm Not tainted
RIP: 0010:percpu_counter_add_batch+0x11/0x70
RSP: 0018:ffffc9000a97bbe0 EFLAGS: 00010286
RAX: 0000000000000005 RBX: 0000000000000090 RCX: 0000000000026115
RDX: 0000000000000030 RSI: ffffffffffffffff RDI: 0000000000000090
RBP: 0000000000000000 R08: fffffffffffffff5 R09: 0000000000000000
R10: 00000000000260c0 R11: ffff881037fc26c0 R12: ffffffffffffffff
R13: ffff880fe4111548 R14: ffffc9000a97bc90 R15: 0000000000000001
FS: 00007f5503ced480(0000) GS:ffff880ff7200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000d0 CR3: 00000001e0459005 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
account_page_cleaned+0x15b/0x1f0
__cancel_dirty_page+0x146/0x200
truncate_cleanup_page+0x92/0xb0
truncate_inode_pages_range+0x202/0x7d0
btrfs_evict_inode+0x92/0x5a0
evict+0xc1/0x190
do_unlinkat+0x176/0x280
do_syscall_64+0x63/0x1a0
entry_SYSCALL_64_after_hwframe+0x42/0xb7
The fix here is to make asyc_chunk->locked_page NULL everywhere but the
one async_chunk struct that's allowed to do things to the locked page.
Link: https://lore.kernel.org/linux-btrfs/c2419d01-5c84-3fb4-189e-4db519d08796@suse.com/
Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Chris Mason <clm@fb.com>
[ update changelog from mail thread discussion ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-10 12:28:16 -07:00
|
|
|
/*
|
|
|
|
|
* The locked_page comes all the way from writepage and its
|
|
|
|
|
* the original page we were actually given. As we spread
|
|
|
|
|
* this large delalloc region across multiple async_chunk
|
|
|
|
|
* structs, only the first struct needs a pointer to locked_page
|
|
|
|
|
*
|
|
|
|
|
* This way we don't need racey decisions about who is supposed
|
|
|
|
|
* to unlock it.
|
|
|
|
|
*/
|
|
|
|
|
if (locked_page) {
|
2019-07-10 12:28:17 -07:00
|
|
|
/*
|
|
|
|
|
* Depending on the compressibility, the pages might or
|
|
|
|
|
* might not go through async. We want all of them to
|
|
|
|
|
* be accounted against wbc once. Let's do it here
|
|
|
|
|
* before the paths diverge. wbc accounting is used
|
|
|
|
|
* only for foreign writeback detection and doesn't
|
|
|
|
|
* need full accuracy. Just account the whole thing
|
|
|
|
|
* against the first page.
|
|
|
|
|
*/
|
|
|
|
|
wbc_account_cgroup_owner(wbc, locked_page,
|
|
|
|
|
cur_end - start);
|
Btrfs: only associate the locked page with one async_chunk struct
The btrfs writepages function collects a large range of pages flagged
for delayed allocation, and then sends them down through the COW code
for processing. When compression is on, we allocate one async_chunk
structure for every 512K, and then run those pages through the
compression code for IO submission.
writepages starts all of this off with a single page, locked by the
original call to extent_write_cache_pages(), and it's important to keep
track of this page because it has already been through
clear_page_dirty_for_io().
The btrfs async_chunk struct has a pointer to the locked_page, and when
we're redirtying the page because compression had to fallback to
uncompressed IO, we use page->index to decide if a given async_chunk
struct really owns that page.
But, this is racey. If a given delalloc range is broken up into two
async_chunks (chunkA and chunkB), we can end up with something like
this:
compress_file_range(chunkA)
submit_compress_extents(chunkA)
submit compressed bios(chunkA)
put_page(locked_page)
compress_file_range(chunkB)
...
Or:
async_cow_submit
submit_compressed_extents <--- falls back to buffered writeout
cow_file_range
extent_clear_unlock_delalloc
__process_pages_contig
put_page(locked_pages)
async_cow_submit
The end result is that chunkA is completed and cleaned up before chunkB
even starts processing. This means we can free locked_page() and reuse
it elsewhere. If we get really lucky, it'll have the same page->index
in its new home as it did before.
While we're processing chunkB, we might decide we need to fall back to
uncompressed IO, and so compress_file_range() will call
__set_page_dirty_nobufers() on chunkB->locked_page.
Without cgroups in use, this creates as a phantom dirty page, which
isn't great but isn't the end of the world. What can happen, it can go
through the fixup worker and the whole COW machinery again:
in submit_compressed_extents():
while (async extents) {
...
cow_file_range
if (!page_started ...)
extent_write_locked_range
else if (...)
unlock_page
continue;
This hasn't been observed in practice but is still possible.
With cgroups in use, we might crash in the accounting code because
page->mapping->i_wb isn't set.
BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0
IP: percpu_counter_add_batch+0x11/0x70
PGD 66534e067 P4D 66534e067 PUD 66534f067 PMD 0
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 16 PID: 2172 Comm: rm Not tainted
RIP: 0010:percpu_counter_add_batch+0x11/0x70
RSP: 0018:ffffc9000a97bbe0 EFLAGS: 00010286
RAX: 0000000000000005 RBX: 0000000000000090 RCX: 0000000000026115
RDX: 0000000000000030 RSI: ffffffffffffffff RDI: 0000000000000090
RBP: 0000000000000000 R08: fffffffffffffff5 R09: 0000000000000000
R10: 00000000000260c0 R11: ffff881037fc26c0 R12: ffffffffffffffff
R13: ffff880fe4111548 R14: ffffc9000a97bc90 R15: 0000000000000001
FS: 00007f5503ced480(0000) GS:ffff880ff7200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000000d0 CR3: 00000001e0459005 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
account_page_cleaned+0x15b/0x1f0
__cancel_dirty_page+0x146/0x200
truncate_cleanup_page+0x92/0xb0
truncate_inode_pages_range+0x202/0x7d0
btrfs_evict_inode+0x92/0x5a0
evict+0xc1/0x190
do_unlinkat+0x176/0x280
do_syscall_64+0x63/0x1a0
entry_SYSCALL_64_after_hwframe+0x42/0xb7
The fix here is to make asyc_chunk->locked_page NULL everywhere but the
one async_chunk struct that's allowed to do things to the locked page.
Link: https://lore.kernel.org/linux-btrfs/c2419d01-5c84-3fb4-189e-4db519d08796@suse.com/
Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Chris Mason <clm@fb.com>
[ update changelog from mail thread discussion ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-10 12:28:16 -07:00
|
|
|
async_chunk[i].locked_page = locked_page;
|
|
|
|
|
locked_page = NULL;
|
|
|
|
|
} else {
|
|
|
|
|
async_chunk[i].locked_page = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2019-07-10 12:28:17 -07:00
|
|
|
if (blkcg_css != blkcg_root_css) {
|
|
|
|
|
css_get(blkcg_css);
|
|
|
|
|
async_chunk[i].blkcg_css = blkcg_css;
|
|
|
|
|
} else {
|
|
|
|
|
async_chunk[i].blkcg_css = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-16 11:30:57 -07:00
|
|
|
btrfs_init_work(&async_chunk[i].work, async_cow_start,
|
|
|
|
|
async_cow_submit, async_cow_free);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-03-12 17:20:24 +02:00
|
|
|
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
|
2016-06-22 18:54:23 -04:00
|
|
|
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2019-03-12 17:20:24 +02:00
|
|
|
btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
|
2008-11-06 22:02:51 -05:00
|
|
|
|
|
|
|
|
|
*nr_written += nr_pages;
|
|
|
|
|
start = cur_end + 1;
|
|
|
|
|
}
|
|
|
|
|
*page_started = 1;
|
|
|
|
|
return 0;
|
2007-12-17 20:14:01 -05:00
|
|
|
}
|
|
|
|
|
|
2021-02-04 19:22:07 +09:00
|
|
|
/*
* Allocate and write out the delalloc range [start, end] piece by piece.
*
* Each iteration calls cow_file_range() for the remaining range with 0 as its
* seventh argument (presumably the unlock flag - confirm against the callee)
* and a done_offset out-parameter:
* - any error other than -EAGAIN is returned to the caller;
* - if *@page_started was set, 0 is returned immediately;
* - on success done_offset is forced to @end;
* - if done_offset still equals start, nothing was processed, so the task
*   waits on the BTRFS_FS_NEED_ZONE_FINISH bit and retries;
* - otherwise [start, done_offset] is written with
*   extent_write_locked_range() and start moves past done_offset.
*
* @locked_page is redirtied once, before the first write-out. On normal loop
* exit *@page_started is set to 1 and 0 is returned.
*/
static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
|
|
|
|
|
struct page *locked_page, u64 start,
|
|
|
|
|
u64 end, int *page_started,
|
|
|
|
|
unsigned long *nr_written)
|
|
|
|
|
{
|
2022-07-09 08:18:49 +09:00
|
|
|
u64 done_offset = end;
|
2021-02-04 19:22:07 +09:00
|
|
|
int ret;
|
2022-07-09 08:18:49 +09:00
|
|
|
bool locked_page_done = false;
|
2021-02-04 19:22:07 +09:00
|
|
|
|
2022-07-09 08:18:49 +09:00
|
|
|
while (start <= end) {
|
|
|
|
|
ret = cow_file_range(inode, locked_page, start, end, page_started,
|
|
|
|
|
nr_written, 0, &done_offset);
|
|
|
|
|
/* -EAGAIN is handled below; every other error is final. */
if (ret && ret != -EAGAIN)
|
|
|
|
|
return ret;
|
2021-02-04 19:22:07 +09:00
|
|
|
|
2022-07-09 08:18:49 +09:00
|
|
|
if (*page_started) {
|
|
|
|
|
ASSERT(ret == 0);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Full success: the whole remaining range is ready to be written. */
if (ret == 0)
|
|
|
|
|
done_offset = end;
|
|
|
|
|
|
2022-07-09 08:18:50 +09:00
|
|
|
/* No progress: wait for BTRFS_FS_NEED_ZONE_FINISH to clear, then retry. */
if (done_offset == start) {
|
2022-08-31 13:55:48 +09:00
|
|
|
wait_on_bit_io(&inode->root->fs_info->flags,
|
|
|
|
|
BTRFS_FS_NEED_ZONE_FINISH,
|
|
|
|
|
TASK_UNINTERRUPTIBLE);
|
2022-07-09 08:18:50 +09:00
|
|
|
continue;
|
|
|
|
|
}
|
2022-07-09 08:18:49 +09:00
|
|
|
|
|
|
|
|
|
/* Redirty @locked_page only once, ahead of the first write-out. */
if (!locked_page_done) {
|
|
|
|
|
__set_page_dirty_nobuffers(locked_page);
|
|
|
|
|
account_page_redirty(locked_page);
|
|
|
|
|
}
|
|
|
|
|
locked_page_done = true;
|
|
|
|
|
extent_write_locked_range(&inode->vfs_inode, start, done_offset);
|
|
|
|
|
|
|
|
|
|
|
|
start = done_offset + 1;
|
|
|
|
|
}
|
2021-02-04 19:22:07 +09:00
|
|
|
|
|
|
|
|
|
*page_started = 1;
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
/*
* Check whether any checksums exist for the byte range
* [bytenr, bytenr + num_bytes - 1].
*
* Looks the range up in the csum root returned by btrfs_csum_root() and
* collects the results on a local list, which is always emptied and freed
* before returning.
*
* Returns 0 if the lookup succeeded and found nothing, a negative value if
* btrfs_lookup_csums_range() failed, and 1 otherwise.
*/
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
|
2008-12-12 10:03:38 -05:00
|
|
|
u64 bytenr, u64 num_bytes)
|
|
|
|
|
{
|
2021-11-05 16:45:48 -04:00
|
|
|
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
|
2008-12-12 10:03:38 -05:00
|
|
|
struct btrfs_ordered_sum *sums;
|
2021-11-05 16:45:48 -04:00
|
|
|
int ret;
|
2008-12-12 10:03:38 -05:00
|
|
|
LIST_HEAD(list);
|
|
|
|
|
|
2021-11-05 16:45:48 -04:00
|
|
|
ret = btrfs_lookup_csums_range(csum_root, bytenr,
|
2011-03-08 14:14:00 +01:00
|
|
|
bytenr + num_bytes - 1, &list, 0);
|
2008-12-12 10:03:38 -05:00
|
|
|
/* Successful lookup with an empty list: no checksums in the range. */
if (ret == 0 && list_empty(&list))
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
|
|
/* Only existence matters; release every sum that was collected. */
while (!list_empty(&list)) {
|
|
|
|
|
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
|
|
|
|
|
list_del(&sums->list);
|
|
|
|
|
kfree(sums);
|
|
|
|
|
}
|
2018-01-31 17:09:13 -07:00
|
|
|
/* Report a lookup failure only after the list has been cleaned up. */
if (ret < 0)
|
|
|
|
|
return ret;
|
2008-12-12 10:03:38 -05:00
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:20 +03:00
|
|
|
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
const u64 start, const u64 end,
|
|
|
|
|
int *page_started, unsigned long *nr_written)
|
|
|
|
|
{
|
2020-06-03 08:55:20 +03:00
|
|
|
const bool is_space_ino = btrfs_is_free_space_inode(inode);
|
2021-09-09 01:19:25 +09:00
|
|
|
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
|
2020-05-27 11:16:19 +01:00
|
|
|
const u64 range_bytes = end + 1 - start;
|
2020-06-03 08:55:20 +03:00
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
u64 range_start = start;
|
|
|
|
|
u64 count;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If EXTENT_NORESERVE is set it means that when the buffered write was
|
|
|
|
|
* made we had not enough available data space and therefore we did not
|
|
|
|
|
* reserve data space for it, since we thought we could do NOCOW for the
|
|
|
|
|
* respective file range (either there is prealloc extent or the inode
|
|
|
|
|
* has the NOCOW bit set).
|
|
|
|
|
*
|
|
|
|
|
* However when we need to fallback to COW mode (because for example the
|
|
|
|
|
* block group for the corresponding extent was turned to RO mode by a
|
|
|
|
|
* scrub or relocation) we need to do the following:
|
|
|
|
|
*
|
|
|
|
|
* 1) We increment the bytes_may_use counter of the data space info.
|
|
|
|
|
* If COW succeeds, it allocates a new data extent and after doing
|
|
|
|
|
* that it decrements the space info's bytes_may_use counter and
|
|
|
|
|
* increments its bytes_reserved counter by the same amount (we do
|
|
|
|
|
* this at btrfs_add_reserved_bytes()). So we need to increment the
|
|
|
|
|
* bytes_may_use counter to compensate (when space is reserved at
|
|
|
|
|
* buffered write time, the bytes_may_use counter is incremented);
|
|
|
|
|
*
|
|
|
|
|
* 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
|
|
|
|
|
* that if the COW path fails for any reason, it decrements (through
|
|
|
|
|
* extent_clear_unlock_delalloc()) the bytes_may_use counter of the
|
|
|
|
|
* data space info, which we incremented in the step above.
|
2020-05-27 11:16:19 +01:00
|
|
|
*
|
|
|
|
|
* If we need to fallback to cow and the inode corresponds to a free
|
btrfs: fix bytes_may_use underflow when running balance and scrub in parallel
When balance and scrub are running in parallel it is possible to end up
with an underflow of the bytes_may_use counter of the data space_info
object, which triggers a warning like the following:
[134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data
[134243.806891] ------------[ cut here ]------------
[134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.808819] Modules linked in: btrfs blake2b_generic xor (...)
[134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483)
[134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.819963] Code: 0b f2 85 (...)
[134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287
[134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000
[134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810
[134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000
[134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000
[134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810
[134243.827432] FS: 0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000
[134243.828451] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0
[134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134243.831867] Call Trace:
[134243.832211] find_free_extent+0x4a0/0x16c0 [btrfs]
[134243.832846] btrfs_reserve_extent+0x91/0x180 [btrfs]
[134243.833487] cow_file_range+0x12d/0x490 [btrfs]
[134243.834080] fallback_to_cow+0x82/0x1b0 [btrfs]
[134243.834689] ? release_extent_buffer+0x121/0x170 [btrfs]
[134243.835370] run_delalloc_nocow+0x33f/0xa30 [btrfs]
[134243.836032] btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
[134243.836725] ? find_lock_delalloc_range+0x221/0x250 [btrfs]
[134243.837450] writepage_delalloc+0xe8/0x150 [btrfs]
[134243.838059] __extent_writepage+0xe8/0x4c0 [btrfs]
[134243.838674] extent_write_cache_pages+0x237/0x530 [btrfs]
[134243.839364] extent_writepages+0x44/0xa0 [btrfs]
[134243.839946] do_writepages+0x23/0x80
[134243.840401] __writeback_single_inode+0x59/0x700
[134243.841006] writeback_sb_inodes+0x267/0x5f0
[134243.841548] __writeback_inodes_wb+0x87/0xe0
[134243.842091] wb_writeback+0x382/0x590
[134243.842574] ? wb_workfn+0x4a2/0x6c0
[134243.843030] wb_workfn+0x4a2/0x6c0
[134243.843468] process_one_work+0x26d/0x6a0
[134243.843978] worker_thread+0x4f/0x3e0
[134243.844452] ? process_one_work+0x6a0/0x6a0
[134243.844981] kthread+0x103/0x140
[134243.845400] ? kthread_create_worker_on_cpu+0x70/0x70
[134243.846030] ret_from_fork+0x3a/0x50
[134243.846494] irq event stamp: 0
[134243.846892] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134243.847682] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.848687] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134243.850698] ---[ end trace bd7c03622e0b0a96 ]---
[134243.851335] ------------[ cut here ]------------
When relocating a data block group, for each extent allocated in the
block group we preallocate another extent with the same size for the
data relocation inode (we do it at prealloc_file_extent_cluster()).
We reserve space by calling btrfs_check_data_free_space(), which ends
up incrementing the data space_info's bytes_may_use counter, and
then call btrfs_prealloc_file_range() to allocate the extent, which
always decrements the bytes_may_use counter by the same amount.
The expectation is that writeback of the data relocation inode always
follows a NOCOW path, by writing into the preallocated extents. However,
when starting writeback we might end up falling back into the COW path,
because the block group that contains the preallocated extent was turned
into RO mode by a scrub running in parallel. The COW path then calls the
extent allocator which ends up calling btrfs_add_reserved_bytes(), and
this function decrements the bytes_may_use counter of the data space_info
object by an amount corresponding to the size of the allocated extent,
despite we haven't previously incremented it. When the counter currently
has a value smaller than the allocated extent we reset the counter to 0
and emit a warning, otherwise we just decrement it and slowly mess up
with this counter which is crucial for space reservation, the end result
can be granting reserved space to tasks when there isn't really enough
free space, and having the tasks fail later in critical places where
error handling consists of a transaction abort or hitting a BUG_ON().
Fix this by making sure that if we fallback to the COW path for a data
relocation inode, we increment the bytes_may_use counter of the data
space_info object. The COW path will then decrement it at
btrfs_add_reserved_bytes() on success or through its error handling part
by a call to extent_clear_unlock_delalloc() (which ends up calling
btrfs_clear_delalloc_extent() that does the decrement operation) in case
of an error.
Test case btrfs/061 from fstests could sporadically trigger this.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:33:05 +01:00
|
|
|
* space cache inode or an inode of the data relocation tree, we must
|
|
|
|
|
* also increment bytes_may_use of the data space_info for the same
|
|
|
|
|
* reason. Space caches and relocated data extents always get a prealloc
|
2020-05-27 11:16:19 +01:00
|
|
|
* extent for them, however scrub or balance may have set the block
|
btrfs: fix bytes_may_use underflow when running balance and scrub in parallel
When balance and scrub are running in parallel it is possible to end up
with an underflow of the bytes_may_use counter of the data space_info
object, which triggers a warning like the following:
[134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data
[134243.806891] ------------[ cut here ]------------
[134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.808819] Modules linked in: btrfs blake2b_generic xor (...)
[134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483)
[134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.819963] Code: 0b f2 85 (...)
[134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287
[134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000
[134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810
[134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000
[134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000
[134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810
[134243.827432] FS: 0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000
[134243.828451] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0
[134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134243.831867] Call Trace:
[134243.832211] find_free_extent+0x4a0/0x16c0 [btrfs]
[134243.832846] btrfs_reserve_extent+0x91/0x180 [btrfs]
[134243.833487] cow_file_range+0x12d/0x490 [btrfs]
[134243.834080] fallback_to_cow+0x82/0x1b0 [btrfs]
[134243.834689] ? release_extent_buffer+0x121/0x170 [btrfs]
[134243.835370] run_delalloc_nocow+0x33f/0xa30 [btrfs]
[134243.836032] btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
[134243.836725] ? find_lock_delalloc_range+0x221/0x250 [btrfs]
[134243.837450] writepage_delalloc+0xe8/0x150 [btrfs]
[134243.838059] __extent_writepage+0xe8/0x4c0 [btrfs]
[134243.838674] extent_write_cache_pages+0x237/0x530 [btrfs]
[134243.839364] extent_writepages+0x44/0xa0 [btrfs]
[134243.839946] do_writepages+0x23/0x80
[134243.840401] __writeback_single_inode+0x59/0x700
[134243.841006] writeback_sb_inodes+0x267/0x5f0
[134243.841548] __writeback_inodes_wb+0x87/0xe0
[134243.842091] wb_writeback+0x382/0x590
[134243.842574] ? wb_workfn+0x4a2/0x6c0
[134243.843030] wb_workfn+0x4a2/0x6c0
[134243.843468] process_one_work+0x26d/0x6a0
[134243.843978] worker_thread+0x4f/0x3e0
[134243.844452] ? process_one_work+0x6a0/0x6a0
[134243.844981] kthread+0x103/0x140
[134243.845400] ? kthread_create_worker_on_cpu+0x70/0x70
[134243.846030] ret_from_fork+0x3a/0x50
[134243.846494] irq event stamp: 0
[134243.846892] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134243.847682] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.848687] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134243.850698] ---[ end trace bd7c03622e0b0a96 ]---
[134243.851335] ------------[ cut here ]------------
When relocating a data block group, for each extent allocated in the
block group we preallocate another extent with the same size for the
data relocation inode (we do it at prealloc_file_extent_cluster()).
We reserve space by calling btrfs_check_data_free_space(), which ends
up incrementing the data space_info's bytes_may_use counter, and
then call btrfs_prealloc_file_range() to allocate the extent, which
always decrements the bytes_may_use counter by the same amount.
The expectation is that writeback of the data relocation inode always
follows a NOCOW path, by writing into the preallocated extents. However,
when starting writeback we might end up falling back into the COW path,
because the block group that contains the preallocated extent was turned
into RO mode by a scrub running in parallel. The COW path then calls the
extent allocator which ends up calling btrfs_add_reserved_bytes(), and
this function decrements the bytes_may_use counter of the data space_info
object by an amount corresponding to the size of the allocated extent,
despite we haven't previously incremented it. When the counter currently
has a value smaller than the allocated extent we reset the counter to 0
and emit a warning, otherwise we just decrement it and slowly mess up
with this counter which is crucial for space reservation, the end result
can be granting reserved space to tasks when there isn't really enough
free space, and having the tasks fail later in critical places where
error handling consists of a transaction abort or hitting a BUG_ON().
Fix this by making sure that if we fallback to the COW path for a data
relocation inode, we increment the bytes_may_use counter of the data
space_info object. The COW path will then decrement it at
btrfs_add_reserved_bytes() on success or through its error handling part
by a call to extent_clear_unlock_delalloc() (which ends up calling
btrfs_clear_delalloc_extent() that does the decrement operation) in case
of an error.
Test case btrfs/061 from fstests could sporadically trigger this.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:33:05 +01:00
|
|
|
* group that contains that extent to RO mode and therefore force COW
|
|
|
|
|
* when starting writeback.
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
*/
|
2020-05-27 11:16:19 +01:00
|
|
|
count = count_range_bits(io_tree, &range_start, end, range_bytes,
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check whether space was
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if it was not (the bit is set), increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
EXTENT_NORESERVE, 0);
|
btrfs: fix bytes_may_use underflow when running balance and scrub in parallel
When balance and scrub are running in parallel it is possible to end up
with an underflow of the bytes_may_use counter of the data space_info
object, which triggers a warning like the following:
[134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data
[134243.806891] ------------[ cut here ]------------
[134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.808819] Modules linked in: btrfs blake2b_generic xor (...)
[134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483)
[134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.819963] Code: 0b f2 85 (...)
[134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287
[134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000
[134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810
[134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000
[134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000
[134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810
[134243.827432] FS: 0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000
[134243.828451] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0
[134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134243.831867] Call Trace:
[134243.832211] find_free_extent+0x4a0/0x16c0 [btrfs]
[134243.832846] btrfs_reserve_extent+0x91/0x180 [btrfs]
[134243.833487] cow_file_range+0x12d/0x490 [btrfs]
[134243.834080] fallback_to_cow+0x82/0x1b0 [btrfs]
[134243.834689] ? release_extent_buffer+0x121/0x170 [btrfs]
[134243.835370] run_delalloc_nocow+0x33f/0xa30 [btrfs]
[134243.836032] btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
[134243.836725] ? find_lock_delalloc_range+0x221/0x250 [btrfs]
[134243.837450] writepage_delalloc+0xe8/0x150 [btrfs]
[134243.838059] __extent_writepage+0xe8/0x4c0 [btrfs]
[134243.838674] extent_write_cache_pages+0x237/0x530 [btrfs]
[134243.839364] extent_writepages+0x44/0xa0 [btrfs]
[134243.839946] do_writepages+0x23/0x80
[134243.840401] __writeback_single_inode+0x59/0x700
[134243.841006] writeback_sb_inodes+0x267/0x5f0
[134243.841548] __writeback_inodes_wb+0x87/0xe0
[134243.842091] wb_writeback+0x382/0x590
[134243.842574] ? wb_workfn+0x4a2/0x6c0
[134243.843030] wb_workfn+0x4a2/0x6c0
[134243.843468] process_one_work+0x26d/0x6a0
[134243.843978] worker_thread+0x4f/0x3e0
[134243.844452] ? process_one_work+0x6a0/0x6a0
[134243.844981] kthread+0x103/0x140
[134243.845400] ? kthread_create_worker_on_cpu+0x70/0x70
[134243.846030] ret_from_fork+0x3a/0x50
[134243.846494] irq event stamp: 0
[134243.846892] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134243.847682] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.848687] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134243.850698] ---[ end trace bd7c03622e0b0a96 ]---
[134243.851335] ------------[ cut here ]------------
When relocating a data block group, for each extent allocated in the
block group we preallocate another extent with the same size for the
data relocation inode (we do it at prealloc_file_extent_cluster()).
We reserve space by calling btrfs_check_data_free_space(), which ends
up incrementing the data space_info's bytes_may_use counter, and
then call btrfs_prealloc_file_range() to allocate the extent, which
always decrements the bytes_may_use counter by the same amount.
The expectation is that writeback of the data relocation inode always
follows a NOCOW path, by writing into the preallocated extents. However,
when starting writeback we might end up falling back into the COW path,
because the block group that contains the preallocated extent was turned
into RO mode by a scrub running in parallel. The COW path then calls the
extent allocator which ends up calling btrfs_add_reserved_bytes(), and
this function decrements the bytes_may_use counter of the data space_info
object by an amount corresponding to the size of the allocated extent,
even though we haven't previously incremented it. When the counter currently
has a value smaller than the allocated extent we reset the counter to 0
and emit a warning, otherwise we just decrement it and slowly corrupt
this counter, which is crucial for space reservation. The end result
can be granting reserved space to tasks when there isn't really enough
free space, and having the tasks fail later in critical places where
error handling consists of a transaction abort or hitting a BUG_ON().
Fix this by making sure that if we fallback to the COW path for a data
relocation inode, we increment the bytes_may_use counter of the data
space_info object. The COW path will then decrement it at
btrfs_add_reserved_bytes() on success or through its error handling part
by a call to extent_clear_unlock_delalloc() (which ends up calling
btrfs_clear_delalloc_extent() that does the decrement operation) in case
of an error.
Test case btrfs/061 from fstests could sporadically trigger this.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:33:05 +01:00
|
|
|
if (count > 0 || is_space_ino || is_reloc_ino) {
|
|
|
|
|
u64 bytes = count;
|
2020-06-03 08:55:20 +03:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check whether space was
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if it was not (the bit is set), increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
|
|
|
|
|
|
btrfs: fix bytes_may_use underflow when running balance and scrub in parallel
When balance and scrub are running in parallel it is possible to end up
with an underflow of the bytes_may_use counter of the data space_info
object, which triggers a warning like the following:
[134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data
[134243.806891] ------------[ cut here ]------------
[134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.808819] Modules linked in: btrfs blake2b_generic xor (...)
[134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5
[134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483)
[134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
[134243.819963] Code: 0b f2 85 (...)
[134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287
[134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000
[134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810
[134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000
[134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000
[134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810
[134243.827432] FS: 0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000
[134243.828451] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0
[134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[134243.831867] Call Trace:
[134243.832211] find_free_extent+0x4a0/0x16c0 [btrfs]
[134243.832846] btrfs_reserve_extent+0x91/0x180 [btrfs]
[134243.833487] cow_file_range+0x12d/0x490 [btrfs]
[134243.834080] fallback_to_cow+0x82/0x1b0 [btrfs]
[134243.834689] ? release_extent_buffer+0x121/0x170 [btrfs]
[134243.835370] run_delalloc_nocow+0x33f/0xa30 [btrfs]
[134243.836032] btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
[134243.836725] ? find_lock_delalloc_range+0x221/0x250 [btrfs]
[134243.837450] writepage_delalloc+0xe8/0x150 [btrfs]
[134243.838059] __extent_writepage+0xe8/0x4c0 [btrfs]
[134243.838674] extent_write_cache_pages+0x237/0x530 [btrfs]
[134243.839364] extent_writepages+0x44/0xa0 [btrfs]
[134243.839946] do_writepages+0x23/0x80
[134243.840401] __writeback_single_inode+0x59/0x700
[134243.841006] writeback_sb_inodes+0x267/0x5f0
[134243.841548] __writeback_inodes_wb+0x87/0xe0
[134243.842091] wb_writeback+0x382/0x590
[134243.842574] ? wb_workfn+0x4a2/0x6c0
[134243.843030] wb_workfn+0x4a2/0x6c0
[134243.843468] process_one_work+0x26d/0x6a0
[134243.843978] worker_thread+0x4f/0x3e0
[134243.844452] ? process_one_work+0x6a0/0x6a0
[134243.844981] kthread+0x103/0x140
[134243.845400] ? kthread_create_worker_on_cpu+0x70/0x70
[134243.846030] ret_from_fork+0x3a/0x50
[134243.846494] irq event stamp: 0
[134243.846892] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[134243.847682] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.848687] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
[134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0
[134243.850698] ---[ end trace bd7c03622e0b0a96 ]---
[134243.851335] ------------[ cut here ]------------
When relocating a data block group, for each extent allocated in the
block group we preallocate another extent with the same size for the
data relocation inode (we do it at prealloc_file_extent_cluster()).
We reserve space by calling btrfs_check_data_free_space(), which ends
up incrementing the data space_info's bytes_may_use counter, and
then call btrfs_prealloc_file_range() to allocate the extent, which
always decrements the bytes_may_use counter by the same amount.
The expectation is that writeback of the data relocation inode always
follows a NOCOW path, by writing into the preallocated extents. However,
when starting writeback we might end up falling back into the COW path,
because the block group that contains the preallocated extent was turned
into RO mode by a scrub running in parallel. The COW path then calls the
extent allocator which ends up calling btrfs_add_reserved_bytes(), and
this function decrements the bytes_may_use counter of the data space_info
object by an amount corresponding to the size of the allocated extent,
even though we haven't previously incremented it. When the counter currently
has a value smaller than the allocated extent we reset the counter to 0
and emit a warning, otherwise we just decrement it and slowly corrupt
this counter, which is crucial for space reservation. The end result
can be granting reserved space to tasks when there isn't really enough
free space, and having the tasks fail later in critical places where
error handling consists of a transaction abort or hitting a BUG_ON().
Fix this by making sure that if we fallback to the COW path for a data
relocation inode, we increment the bytes_may_use counter of the data
space_info object. The COW path will then decrement it at
btrfs_add_reserved_bytes() on success or through its error handling part
by a call to extent_clear_unlock_delalloc() (which ends up calling
btrfs_clear_delalloc_extent() that does the decrement operation) in case
of an error.
Test case btrfs/061 from fstests could sporadically trigger this.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-08 13:33:05 +01:00
|
|
|
if (is_space_ino || is_reloc_ino)
|
|
|
|
|
bytes = range_bytes;
|
|
|
|
|
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check whether space was
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if it was not (the bit is set), increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
spin_lock(&sinfo->lock);
|
2020-05-27 11:16:19 +01:00
|
|
|
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check whether space was
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if it was not (the bit is set), increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
spin_unlock(&sinfo->lock);
|
|
|
|
|
|
2020-05-27 11:16:19 +01:00
|
|
|
if (count > 0)
|
|
|
|
|
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
|
|
|
|
|
0, 0, NULL);
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:20 +03:00
|
|
|
return cow_file_range(inode, locked_page, start, end, page_started,
|
2022-07-09 08:18:49 +09:00
|
|
|
nr_written, 1, NULL);
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
}
|
|
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
struct can_nocow_file_extent_args {
|
|
|
|
|
/* Input fields. */
|
|
|
|
|
|
|
|
|
|
/* Start file offset of the range we want to NOCOW. */
|
|
|
|
|
u64 start;
|
|
|
|
|
/* End file offset (inclusive) of the range we want to NOCOW. */
|
|
|
|
|
u64 end;
|
|
|
|
|
bool writeback_path;
|
|
|
|
|
bool strict;
|
|
|
|
|
/*
|
|
|
|
|
* Free the path passed to can_nocow_file_extent() once it's not needed
|
|
|
|
|
* anymore.
|
|
|
|
|
*/
|
|
|
|
|
bool free_path;
|
|
|
|
|
|
|
|
|
|
/* Output fields. Only set when can_nocow_file_extent() returns 1. */
|
|
|
|
|
|
|
|
|
|
u64 disk_bytenr;
|
|
|
|
|
u64 disk_num_bytes;
|
|
|
|
|
u64 extent_offset;
|
|
|
|
|
/* Number of bytes that can be written to in NOCOW mode. */
|
|
|
|
|
u64 num_bytes;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Check if we can NOCOW the file extent that the path points to.
|
|
|
|
|
* This function may return with the path released, so the caller should check
|
|
|
|
|
* if path->nodes[0] is NULL or not if it needs to use the path afterwards.
|
|
|
|
|
*
|
|
|
|
|
* Returns: < 0 on error
|
|
|
|
|
* 0 if we can not NOCOW
|
|
|
|
|
* 1 if we can NOCOW
|
|
|
|
|
*/
|
|
|
|
|
static int can_nocow_file_extent(struct btrfs_path *path,
|
|
|
|
|
struct btrfs_key *key,
|
|
|
|
|
struct btrfs_inode *inode,
|
|
|
|
|
struct can_nocow_file_extent_args *args)
|
|
|
|
|
{
|
|
|
|
|
const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
|
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
|
u64 extent_end;
|
|
|
|
|
u8 extent_type;
|
|
|
|
|
int can_nocow = 0;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
|
|
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
|
|
|
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* Can't access these fields unless we know it's not an inline extent. */
|
|
|
|
|
args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
|
|
|
|
args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
|
|
|
|
|
args->extent_offset = btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
|
|
|
|
|
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
|
|
|
|
|
extent_type == BTRFS_FILE_EXTENT_REG)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If the extent was created before the generation where the last snapshot
|
|
|
|
|
* for its subvolume was created, then this implies the extent is shared,
|
|
|
|
|
* hence we must COW.
|
|
|
|
|
*/
|
2022-03-30 15:31:07 +01:00
|
|
|
if (!args->strict &&
|
2022-03-30 15:31:06 +01:00
|
|
|
btrfs_file_extent_generation(leaf, fi) <=
|
|
|
|
|
btrfs_root_last_snapshot(&root->root_item))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* An explicit hole, must COW. */
|
|
|
|
|
if (args->disk_bytenr == 0)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
/* Compressed/encrypted/encoded extents must be COWed. */
|
|
|
|
|
if (btrfs_file_extent_compression(leaf, fi) ||
|
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) ||
|
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
extent_end = btrfs_file_extent_end(path);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The following checks can be expensive, as they need to take other
|
|
|
|
|
* locks and do btree or rbtree searches, so release the path to avoid
|
|
|
|
|
* blocking other tasks for too long.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
|
|
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
|
|
|
|
|
key->offset - args->extent_offset,
|
|
|
|
|
args->disk_bytenr, false, path);
|
|
|
|
|
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
|
|
|
|
|
if (ret != 0)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
if (args->free_path) {
|
|
|
|
|
/*
|
|
|
|
|
* We don't need the path anymore, plus through the
|
|
|
|
|
* csum_exist_in_range() call below we will end up allocating
|
|
|
|
|
* another path. So free the path to avoid unnecessary extra
|
|
|
|
|
* memory usage.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
path = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If there are pending snapshots for this root, we must COW. */
|
|
|
|
|
if (args->writeback_path && !is_freespace_inode &&
|
|
|
|
|
atomic_read(&root->snapshot_force_cow))
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
args->disk_bytenr += args->extent_offset;
|
|
|
|
|
args->disk_bytenr += args->start - key->offset;
|
|
|
|
|
args->num_bytes = min(args->end + 1, extent_end) - args->start;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Force COW if csums exist in the range. This ensures that csums for a
|
|
|
|
|
* given extent are either valid or do not exist.
|
|
|
|
|
*/
|
|
|
|
|
ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
|
|
|
|
|
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
|
|
|
|
|
if (ret != 0)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
can_nocow = 1;
|
|
|
|
|
out:
|
|
|
|
|
if (args->free_path && path)
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
|
|
return ret < 0 ? ret : can_nocow;
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* Nocow writeback callback. This checks for snapshots or COW copies
|
|
|
|
|
* of the extents that exist in the file, and COWs the file as required.
|
|
|
|
|
*
|
|
|
|
|
* If no cow copies or snapshots exist, we write directly to the existing
|
|
|
|
|
* blocks on disk
|
|
|
|
|
*/
|
2020-06-03 08:55:21 +03:00
|
|
|
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
|
2009-03-12 20:12:45 -04:00
|
|
|
struct page *locked_page,
|
2019-08-21 10:42:03 +03:00
|
|
|
const u64 start, const u64 end,
|
2021-03-04 09:06:25 -06:00
|
|
|
int *page_started,
|
2019-08-21 10:42:03 +03:00
|
|
|
unsigned long *nr_written)
|
2007-12-17 20:14:01 -05:00
|
|
|
{
|
2020-06-03 08:55:21 +03:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
2007-12-17 20:14:01 -05:00
|
|
|
struct btrfs_path *path;
|
2019-08-21 10:42:03 +03:00
|
|
|
u64 cow_start = (u64)-1;
|
|
|
|
|
u64 cur_offset = start;
|
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced
nocow writes to fallback to COW, during writeback, when a snapshot is
created. This resulted in writes made before creating the snapshot to
unexpectedly fail with ENOSPC during writeback when success (0) was
returned to user space through the write system call.
The steps leading to this problem are:
1. When it's not possible to allocate data space for a write, the
buffered write path checks if a NOCOW write is possible. If it is,
it will not reserve space and success (0) is returned to user space.
2. Then when a snapshot is created, the root's will_be_snapshotted
atomic is incremented and writeback is triggered for all inodes that
belong to the root being snapshotted. Incrementing that atomic forces
all previous writes to fallback to COW during writeback (running
delalloc).
3. This results in the writeback for the inodes to fail and therefore
setting the ENOSPC error in their mappings, so that a subsequent
fsync on them will report the error to user space. So it's not a
completely silent data loss (since fsync will report ENOSPC) but it's
a very unexpected and undesirable behaviour, because if a clean
shutdown/unmount of the filesystem happens without previous calls to
fsync, it is expected to have the data present in the files after
mounting the filesystem again.
So fix this by adding a new atomic named snapshot_force_cow to the
root structure which prevents this behaviour and works the following way:
1. It is incremented when we start to create a snapshot after triggering
writeback and before waiting for writeback to finish.
2. This new atomic is now what is used by writeback (running delalloc)
to decide whether we need to fallback to COW or not. Because we
incremented this new atomic after triggering writeback in the
snapshot creation ioctl, we ensure that all buffered writes that
happened before snapshot creation will succeed and not fallback to
COW (which would make them fail with ENOSPC).
3. The existing atomic, will_be_snapshotted, is kept because it is used
to force new buffered writes, that start after we started
snapshotting, to reserve data space even when NOCOW is possible.
This makes these writes fail early with ENOSPC when there's no
available space to allocate, preventing the unexpected behaviour of
writeback later failing with ENOSPC due to a fallback to COW mode.
Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 10:30:30 +08:00
|
|
|
int ret;
|
2019-08-21 10:42:03 +03:00
|
|
|
bool check_prev = true;
|
2020-06-03 08:55:21 +03:00
|
|
|
u64 ino = btrfs_ino(inode);
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the other
hand there's access to the tree in the NOCOW path, when incrementing a block
group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
struct btrfs_block_group *bg;
|
2019-08-22 17:24:20 +03:00
|
|
|
bool nocow = false;
|
2022-03-30 15:31:06 +01:00
|
|
|
struct can_nocow_file_extent_args nocow_args = { 0 };
|
2007-12-17 20:14:01 -05:00
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2012-05-31 15:58:55 -04:00
|
|
|
if (!path) {
|
2020-06-03 08:55:21 +03:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, locked_page,
|
2013-07-29 11:20:47 -04:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2013-07-29 13:22:24 -04:00
|
|
|
EXTENT_DO_ACCOUNTING |
|
|
|
|
|
EXTENT_DEFRAG, PAGE_UNLOCK |
|
2021-01-26 16:33:45 +08:00
|
|
|
PAGE_START_WRITEBACK |
|
2013-07-29 11:20:47 -04:00
|
|
|
PAGE_END_WRITEBACK);
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 10:38:47 -07:00
|
|
|
return -ENOMEM;
|
2012-05-31 15:58:55 -04:00
|
|
|
}
|
2011-04-20 10:33:24 +08:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_args.end = end;
|
|
|
|
|
nocow_args.writeback_path = true;
|
|
|
|
|
|
2008-10-30 14:20:02 -04:00
|
|
|
while (1) {
|
2019-08-21 10:42:03 +03:00
|
|
|
struct btrfs_key found_key;
|
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
u64 extent_end;
|
|
|
|
|
u64 ram_bytes;
|
2022-03-30 15:31:06 +01:00
|
|
|
u64 nocow_end;
|
2019-08-21 10:42:03 +03:00
|
|
|
int extent_type;
|
2019-08-22 17:24:20 +03:00
|
|
|
|
|
|
|
|
nocow = false;
|
2019-08-21 10:42:03 +03:00
|
|
|
|
2017-01-30 12:25:28 -08:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
|
2008-10-30 14:20:02 -04:00
|
|
|
cur_offset, 0);
|
2013-10-25 16:55:08 -04:00
|
|
|
if (ret < 0)
|
2012-03-12 16:03:00 +01:00
|
|
|
goto error;
|
2019-08-21 10:42:57 +03:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If there is no extent for our range when doing the initial
|
|
|
|
|
* search, then go back to the previous slot as it will be the
|
|
|
|
|
* one containing the search offset
|
|
|
|
|
*/
|
2008-10-30 14:20:02 -04:00
|
|
|
if (ret > 0 && path->slots[0] > 0 && check_prev) {
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key,
|
|
|
|
|
path->slots[0] - 1);
|
2011-04-20 10:31:50 +08:00
|
|
|
if (found_key.objectid == ino &&
|
2008-10-30 14:20:02 -04:00
|
|
|
found_key.type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
}
|
2019-08-21 10:42:03 +03:00
|
|
|
check_prev = false;
|
2008-10-30 14:20:02 -04:00
|
|
|
next_slot:
|
2019-08-21 10:42:57 +03:00
|
|
|
/* Go to next leaf if we have exhausted the current one */
|
2008-10-30 14:20:02 -04:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
2018-01-25 11:02:50 -07:00
|
|
|
if (ret < 0) {
|
|
|
|
|
if (cow_start != (u64)-1)
|
|
|
|
|
cur_offset = cow_start;
|
2012-03-12 16:03:00 +01:00
|
|
|
goto error;
|
2018-01-25 11:02:50 -07:00
|
|
|
}
|
2008-10-30 14:20:02 -04:00
|
|
|
if (ret > 0)
|
|
|
|
|
break;
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
}
|
2007-12-17 20:14:01 -05:00
|
|
|
|
2008-10-30 14:20:02 -04:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
2019-08-21 10:42:57 +03:00
|
|
|
/* Didn't find anything for our INO */
|
Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
If we are using the NO_HOLES feature, we have a tiny time window when
running delalloc for a nodatacow inode where we can race with a concurrent
link or xattr add operation leading to a BUG_ON.
This happens because at run_delalloc_nocow() we end up casting a leaf item
of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
file extent item (struct btrfs_file_extent_item) and then analyse its
extent type field, which won't match any of the expected extent types
(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
explicit BUG_ON(1).
The following sequence diagram shows how the race happens when running a
no-cow delalloc range [4K, 8K[ for inode 257 and we have the following
neighbour leafs:
Leaf X (has N items) Leaf Y
[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
slot N - 2 slot N - 1 slot 0
(Note the implicit hole for inode 257 regarding the [0, 8K[ range)
CPU 1 CPU 2
run_delalloc_nocow()
btrfs_lookup_file_extent()
--> searches for a key with value
(257 EXTENT_DATA 4096) in the
fs/subvol tree
--> returns us a path with
path->nodes[0] == leaf X and
path->slots[0] == N
because path->slots[0] is >=
btrfs_header_nritems(leaf X), it
calls btrfs_next_leaf()
btrfs_next_leaf()
--> releases the path
hard link added to our inode,
with key (257 INODE_REF 500)
added to the end of leaf X,
so leaf X now has N + 1 keys
--> searches for the key
(257 INODE_REF 256), because
it was the last key in leaf X
before it released the path,
with path->keep_locks set to 1
--> ends up at leaf X again and
it verifies that the key
(257 INODE_REF 256) is no longer
the last key in the leaf, so it
returns with path->nodes[0] ==
leaf X and path->slots[0] == N,
pointing to the new item with
key (257 INODE_REF 500)
the loop iteration of run_delalloc_nocow()
does not break out the loop and continues
because the key referenced in the path
at path->nodes[0] and path->slots[0] is
for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
and its offset (500) is less than our delalloc
range's end (8192)
the item pointed by the path, an inode reference item,
is (incorrectly) interpreted as a file extent item and
we get an invalid extent type, leading to the BUG_ON(1):
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
(...)
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
(...)
} else {
BUG_ON(1)
}
The same can happen if a xattr is added concurrently and ends up having
a key with an offset smaller than the delalloc's range end.
So fix this by skipping keys with a type smaller than
BTRFS_EXTENT_DATA_KEY.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-09 00:33:58 +00:00
|
|
|
if (found_key.objectid > ino)
|
|
|
|
|
break;
|
2019-08-21 10:42:57 +03:00
|
|
|
/*
|
|
|
|
|
* Keep searching until we find an EXTENT_ITEM or there are no
|
|
|
|
|
* more extents for this inode
|
|
|
|
|
*/
|
Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
If we are using the NO_HOLES feature, we have a tiny time window when
running delalloc for a nodatacow inode where we can race with a concurrent
link or xattr add operation leading to a BUG_ON.
This happens because at run_delalloc_nocow() we end up casting a leaf item
of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
file extent item (struct btrfs_file_extent_item) and then analyse its
extent type field, which won't match any of the expected extent types
(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
explicit BUG_ON(1).
The following sequence diagram shows how the race happens when running a
no-cow delalloc range [4K, 8K[ for inode 257 and we have the following
neighbour leafs:
Leaf X (has N items) Leaf Y
[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
slot N - 2 slot N - 1 slot 0
(Note the implicit hole for inode 257 regarding the [0, 8K[ range)
CPU 1 CPU 2
run_delalloc_nocow()
btrfs_lookup_file_extent()
--> searches for a key with value
(257 EXTENT_DATA 4096) in the
fs/subvol tree
--> returns us a path with
path->nodes[0] == leaf X and
path->slots[0] == N
because path->slots[0] is >=
btrfs_header_nritems(leaf X), it
calls btrfs_next_leaf()
btrfs_next_leaf()
--> releases the path
hard link added to our inode,
with key (257 INODE_REF 500)
added to the end of leaf X,
so leaf X now has N + 1 keys
--> searches for the key
(257 INODE_REF 256), because
it was the last key in leaf X
before it released the path,
with path->keep_locks set to 1
--> ends up at leaf X again and
it verifies that the key
(257 INODE_REF 256) is no longer
the last key in the leaf, so it
returns with path->nodes[0] ==
leaf X and path->slots[0] == N,
pointing to the new item with
key (257 INODE_REF 500)
the loop iteration of run_delalloc_nocow()
does not break out the loop and continues
because the key referenced in the path
at path->nodes[0] and path->slots[0] is
for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
and its offset (500) is less than our delalloc
range's end (8192)
the item pointed by the path, an inode reference item,
is (incorrectly) interpreted as a file extent item and
we get an invalid extent type, leading to the BUG_ON(1):
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
(...)
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
(...)
} else {
BUG_ON(1)
}
The same can happen if a xattr is added concurrently and ends up having
a key with an offset smaller than the delalloc's range end.
So fix this by skipping keys with a type smaller than
BTRFS_EXTENT_DATA_KEY.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-09 00:33:58 +00:00
|
|
|
if (WARN_ON_ONCE(found_key.objectid < ino) ||
|
|
|
|
|
found_key.type < BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
|
path->slots[0]++;
|
|
|
|
|
goto next_slot;
|
|
|
|
|
}
|
2019-08-21 10:42:57 +03:00
|
|
|
|
|
|
|
|
/* Found key is not EXTENT_DATA_KEY or starts after req range */
|
Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
If we are using the NO_HOLES feature, we have a tiny time window when
running delalloc for a nodatacow inode where we can race with a concurrent
link or xattr add operation leading to a BUG_ON.
This happens because at run_delalloc_nocow() we end up casting a leaf item
of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
file extent item (struct btrfs_file_extent_item) and then analyse its
extent type field, which won't match any of the expected extent types
(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
explicit BUG_ON(1).
The following sequence diagram shows how the race happens when running a
no-cow delalloc range [4K, 8K[ for inode 257 and we have the following
neighbour leafs:
Leaf X (has N items) Leaf Y
[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
slot N - 2 slot N - 1 slot 0
(Note the implicit hole for inode 257 regarding the [0, 8K[ range)
CPU 1 CPU 2
run_delalloc_nocow()
btrfs_lookup_file_extent()
--> searches for a key with value
(257 EXTENT_DATA 4096) in the
fs/subvol tree
--> returns us a path with
path->nodes[0] == leaf X and
path->slots[0] == N
because path->slots[0] is >=
btrfs_header_nritems(leaf X), it
calls btrfs_next_leaf()
btrfs_next_leaf()
--> releases the path
hard link added to our inode,
with key (257 INODE_REF 500)
added to the end of leaf X,
so leaf X now has N + 1 keys
--> searches for the key
(257 INODE_REF 256), because
it was the last key in leaf X
before it released the path,
with path->keep_locks set to 1
--> ends up at leaf X again and
it verifies that the key
(257 INODE_REF 256) is no longer
the last key in the leaf, so it
returns with path->nodes[0] ==
leaf X and path->slots[0] == N,
pointing to the new item with
key (257 INODE_REF 500)
the loop iteration of run_delalloc_nocow()
does not break out the loop and continues
because the key referenced in the path
at path->nodes[0] and path->slots[0] is
for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
and its offset (500) is less than our delalloc
range's end (8192)
the item pointed by the path, an inode reference item,
is (incorrectly) interpreted as a file extent item and
we get an invalid extent type, leading to the BUG_ON(1):
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
(...)
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
(...)
} else {
BUG_ON(1)
}
The same can happen if a xattr is added concurrently and ends up having
a key with an offset smaller than the delalloc's range end.
So fix this by skipping keys with a type smaller than
BTRFS_EXTENT_DATA_KEY.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-09 00:33:58 +00:00
|
|
|
if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
|
2008-10-30 14:20:02 -04:00
|
|
|
found_key.offset > end)
|
|
|
|
|
break;
|
|
|
|
|
|
2019-08-21 10:42:57 +03:00
|
|
|
/*
|
|
|
|
|
* If the found extent starts after requested offset, then
|
|
|
|
|
* adjust extent_end to be right before this extent begins
|
|
|
|
|
*/
|
2008-10-30 14:20:02 -04:00
|
|
|
if (found_key.offset > cur_offset) {
|
|
|
|
|
extent_end = found_key.offset;
|
2009-10-09 09:57:45 -04:00
|
|
|
extent_type = 0;
|
2008-10-30 14:20:02 -04:00
|
|
|
goto out_check;
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-21 10:42:57 +03:00
|
|
|
/*
|
|
|
|
|
* Found extent which begins before our range and potentially
|
|
|
|
|
* intersect it
|
|
|
|
|
*/
|
2008-10-30 14:20:02 -04:00
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
2022-03-30 15:31:06 +01:00
|
|
|
/* If this is triggered then we have a memory corruption. */
|
|
|
|
|
ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
|
|
|
|
|
if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
|
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
2013-04-04 14:31:27 -04:00
|
|
|
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
|
2022-03-30 15:31:06 +01:00
|
|
|
extent_end = btrfs_file_extent_end(path);
|
btrfs: unlock path before checking if extent is shared during nocow writeback
When we are attempting to start writeback for an existing extent in NOCOW
mode, at run_delalloc_nocow(), we must check if the extent is shared, and
if it is, fallback to a COW write. However we do such check while still
holding a read lock on the leaf that contains the file extent item, and
that check, the call to btrfs_cross_ref_exist(), can take some time
because:
1) It needs to do a search on the extent tree, which obviously takes some
time, especially if delayed references are being run at the moment, as
we can block when trying to lock currently write locked btree nodes;
2) It needs to check the delayed references for any existing reference
for our data extent, this requires acquiring the delayed references'
spinlock and maybe block on the mutex of a delayed reference head in the
case where there is a delayed reference for our data extent, in the
worst case it makes us release the path on the extent tree and retry
the whole process again (going back to step 1).
There are other operations we do while holding the leaf locked that can
take some significant time as well (especially all together):
* btrfs_extent_readonly() - to check if the block group containing the
extent is currently in RO mode. This requires taking a spinlock and
searching for the block group in a rbtree that can be big on large
filesystems;
* csum_exist_in_range() - to search if there are any checksums in the
csum tree for the extent. Like before, this can take some time if we are
in a filesystem that has both COW and NOCOW files, in which case the
csum tree is not empty;
* btrfs_inc_nocow_writers() - increment the number of nocow writers in the
block group that contains the data extent. Needs to acquire a spinlock
and search for the block group in a rbtree that can be big on large
filesystems.
So just unlock the leaf (release the path) before doing all those checks,
since we do not need it anymore. In case we can not do a NOCOW write for
the extent, due to any of those checks failing, and the writeback range
goes beyond that extent's length, we will do another btree search for the
next file extent item.
The following script that calls dbench was used to measure the impact of
this change on a VM with 8 CPUs, 16Gb of ram, using a raw NVMe device
directly (no intermediary filesystem on the host) and using a non-debug
kernel (default configuration on Debian):
$ cat test-dbench.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-m single -d single"
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -D $MNT -t 300 64
umount $MNT
Before this change:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 9326331 0.317 399.957
Close 6851198 0.002 6.402
Rename 394894 2.621 402.819
Unlink 1883131 0.931 398.082
Deltree 256 19.160 303.580
Mkdir 128 0.003 0.016
Qpathinfo 8452314 0.068 116.133
Qfileinfo 1481921 0.001 5.081
Qfsinfo 1549963 0.002 4.444
Sfileinfo 759679 0.084 17.079
Find 3268168 0.396 118.196
WriteX 4653310 0.056 110.993
ReadX 14618818 0.005 23.314
LockX 30364 0.003 0.497
UnlockX 30364 0.002 1.720
Flush 653619 16.954 569.299
Throughput 966.651 MB/sec 64 clients 64 procs max_latency=569.377 ms
After this change:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 9710433 0.302 232.449
Close 7132948 0.002 11.496
Rename 411144 2.452 131.805
Unlink 1960961 0.893 230.383
Deltree 256 14.858 198.646
Mkdir 128 0.002 0.005
Qpathinfo 8800890 0.066 111.588
Qfileinfo 1542556 0.001 3.852
Qfsinfo 1613835 0.002 5.483
Sfileinfo 790871 0.081 19.492
Find 3402743 0.386 120.185
WriteX 4842918 0.054 179.312
ReadX 15220407 0.005 32.435
LockX 31612 0.003 1.533
UnlockX 31612 0.002 1.047
Flush 680567 16.320 463.323
Throughput 1016.59 MB/sec 64 clients 64 procs max_latency=463.327 ms
+5.0% throughput, -20.5% max latency
Also, the following test using fio was run:
$ cat test-fio.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 4 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
BLOCK_SIZE=$4
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=randwrite
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=$BLOCK_SIZE
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo
echo "Using fio config:"
echo
cat /tmp/fio-job.ini
echo
echo "mount options: $MOUNT_OPTIONS"
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
echo "Creating nodatacow files before fio runs..."
for ((i = 0; i < $NUM_JOBS; i++)); do
xfs_io -f -c "pwrite -b 128M 0 $FILE_SIZE" "$MNT/writers.$i.0"
done
sync
fio /tmp/fio-job.ini
umount $MNT
Before this change:
$ ./test-fio.sh 16 512M 2 4K
(...)
WRITE: bw=28.3MiB/s (29.6MB/s), 28.3MiB/s-28.3MiB/s (29.6MB/s-29.6MB/s), io=8192MiB (8590MB), run=289800-289800msec
After this change:
$ ./test-fio.sh 16 512M 2 4K
(...)
WRITE: bw=31.2MiB/s (32.7MB/s), 31.2MiB/s-31.2MiB/s (32.7MB/s-32.7MB/s), io=8192MiB (8590MB), run=262845-262845msec
+9.7% throughput, -9.8% runtime
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-18 11:00:17 +00:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
/*
|
|
|
|
|
* If the extent we got ends before our current offset, skip to
|
|
|
|
|
* the next extent.
|
|
|
|
|
*/
|
|
|
|
|
if (extent_end <= cur_offset) {
|
|
|
|
|
path->slots[0]++;
|
|
|
|
|
goto next_slot;
|
|
|
|
|
}
|
btrfs: unlock path before checking if extent is shared during nocow writeback
When we are attempting to start writeback for an existing extent in NOCOW
mode, at run_delalloc_nocow(), we must check if the extent is shared, and
if it is, fallback to a COW write. However we do such check while still
holding a read lock on the leaf that contains the file extent item, and
that check, the call to btrfs_cross_ref_exist(), can take some time
because:
1) It needs to do a search on the extent tree, which obviously takes some
time, especially if delayed references are being run at the moment, as
we can block when trying to lock currently write locked btree nodes;
2) It needs to check the delayed references for any existing reference
for our data extent, this requires acquiring the delayed references'
spinlock and maybe block on the mutex of a delayed reference head in the
case where there is a delayed reference for our data extent, in the
worst case it makes us release the path on the extent tree and retry
the whole process again (going back to step 1).
There are other operations we do while holding the leaf locked that can
take some significant time as well (especially all together):
* btrfs_extent_readonly() - to check if the block group containing the
extent is currently in RO mode. This requires taking a spinlock and
searching for the block group in a rbtree that can be big on large
filesystems;
* csum_exist_in_range() - to search if there are any checksums in the
csum tree for the extent. Like before, this can take some time if we are
in a filesystem that has both COW and NOCOW files, in which case the
csum tree is not empty;
* btrfs_inc_nocow_writers() - increment the number of nocow writers in the
block group that contains the data extent. Needs to acquire a spinlock
and search for the block group in a rbtree that can be big on large
filesystems.
So just unlock the leaf (release the path) before doing all those checks,
since we do not need it anymore. In case we can not do a NOCOW write for
the extent, due to any of those checks failing, and the writeback range
goes beyond that extent's length, we will do another btree search for the
next file extent item.
The following script that calls dbench was used to measure the impact of
this change on a VM with 8 CPUs, 16Gb of ram, using a raw NVMe device
directly (no intermediary filesystem on the host) and using a non-debug
kernel (default configuration on Debian):
$ cat test-dbench.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-m single -d single"
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -D $MNT -t 300 64
umount $MNT
Before this change:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 9326331 0.317 399.957
Close 6851198 0.002 6.402
Rename 394894 2.621 402.819
Unlink 1883131 0.931 398.082
Deltree 256 19.160 303.580
Mkdir 128 0.003 0.016
Qpathinfo 8452314 0.068 116.133
Qfileinfo 1481921 0.001 5.081
Qfsinfo 1549963 0.002 4.444
Sfileinfo 759679 0.084 17.079
Find 3268168 0.396 118.196
WriteX 4653310 0.056 110.993
ReadX 14618818 0.005 23.314
LockX 30364 0.003 0.497
UnlockX 30364 0.002 1.720
Flush 653619 16.954 569.299
Throughput 966.651 MB/sec 64 clients 64 procs max_latency=569.377 ms
After this change:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 9710433 0.302 232.449
Close 7132948 0.002 11.496
Rename 411144 2.452 131.805
Unlink 1960961 0.893 230.383
Deltree 256 14.858 198.646
Mkdir 128 0.002 0.005
Qpathinfo 8800890 0.066 111.588
Qfileinfo 1542556 0.001 3.852
Qfsinfo 1613835 0.002 5.483
Sfileinfo 790871 0.081 19.492
Find 3402743 0.386 120.185
WriteX 4842918 0.054 179.312
ReadX 15220407 0.005 32.435
LockX 31612 0.003 1.533
UnlockX 31612 0.002 1.047
Flush 680567 16.320 463.323
Throughput 1016.59 MB/sec 64 clients 64 procs max_latency=463.327 ms
+5.0% throughput, -20.5% max latency
Also, the following test using fio was run:
$ cat test-fio.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 4 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
BLOCK_SIZE=$4
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=randwrite
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=$BLOCK_SIZE
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo
echo "Using fio config:"
echo
cat /tmp/fio-job.ini
echo
echo "mount options: $MOUNT_OPTIONS"
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
echo "Creating nodatacow files before fio runs..."
for ((i = 0; i < $NUM_JOBS; i++)); do
xfs_io -f -c "pwrite -b 128M 0 $FILE_SIZE" "$MNT/writers.$i.0"
done
sync
fio /tmp/fio-job.ini
umount $MNT
Before this change:
$ ./test-fio.sh 16 512M 2 4K
(...)
WRITE: bw=28.3MiB/s (29.6MB/s), 28.3MiB/s-28.3MiB/s (29.6MB/s-29.6MB/s), io=8192MiB (8590MB), run=289800-289800msec
After this change:
$ ./test-fio.sh 16 512M 2 4K
(...)
WRITE: bw=31.2MiB/s (32.7MB/s), 31.2MiB/s-31.2MiB/s (32.7MB/s-32.7MB/s), io=8192MiB (8590MB), run=262845-262845msec
+9.7% throughput, -9.8% runtime
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-18 11:00:17 +00:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_args.start = cur_offset;
|
|
|
|
|
ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
if (cow_start != (u64)-1)
|
|
|
|
|
cur_offset = cow_start;
|
|
|
|
|
goto error;
|
|
|
|
|
} else if (ret == 0) {
|
|
|
|
|
goto out_check;
|
|
|
|
|
}
|
2018-01-31 17:09:13 -07:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
ret = 0;
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the other
hand there's access to the tree in the NOCOW path, when incrementing a block
group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
|
|
|
|
|
if (bg)
|
2019-08-21 10:42:03 +03:00
|
|
|
nocow = true;
|
2008-10-30 14:20:02 -04:00
|
|
|
out_check:
|
2019-08-21 10:42:57 +03:00
|
|
|
/*
|
|
|
|
|
* If nocow is false then record the beginning of the range
|
|
|
|
|
* that needs to be COWed
|
|
|
|
|
*/
|
2008-10-30 14:20:02 -04:00
|
|
|
if (!nocow) {
|
|
|
|
|
if (cow_start == (u64)-1)
|
|
|
|
|
cow_start = cur_offset;
|
|
|
|
|
cur_offset = extent_end;
|
|
|
|
|
if (cur_offset > end)
|
|
|
|
|
break;
|
btrfs: unlock path before checking if extent is shared during nocow writeback
When we are attempting to start writeback for an existing extent in NOCOW
mode, at run_delalloc_nocow(), we must check if the extent is shared, and
if it is, fallback to a COW write. However we do such check while still
holding a read lock on the leaf that contains the file extent item, and
that check, the call to btrfs_cross_ref_exist(), can take some time
because:
1) It needs to do a search on the extent tree, which obviously takes some
time, especially if delayed references are being run at the moment, as
we can block when trying to lock currently write locked btree nodes;
2) It needs to check the delayed references for any existing reference
for our data extent, this requires acquiring the delayed references'
spinlock and maybe block on the mutex of a delayed reference head in the
case where there is a delayed reference for our data extent, in the
worst case it makes us release the path on the extent tree and retry
the whole process again (going back to step 1).
There are other operations we do while holding the leaf locked that can
take some significant time as well (especially all together):
* btrfs_extent_readonly() - to check if the block group containing the
extent is currently in RO mode. This requires taking a spinlock and
searching for the block group in a rbtree that can be big on large
filesystems;
* csum_exist_in_range() - to search if there are any checksums in the
csum tree for the extent. Like before, this can take some time if we are
in a filesystem that has both COW and NOCOW files, in which case the
csum tree is not empty;
* btrfs_inc_nocow_writers() - increment the number of nocow writers in the
block group that contains the data extent. Needs to acquire a spinlock
and search for the block group in a rbtree that can be big on large
filesystems.
So just unlock the leaf (release the path) before doing all those checks,
since we do not need it anymore. In case we can not do a NOCOW write for
the extent, due to any of those checks failing, and the writeback range
goes beyond that extent's length, we will do another btree search for the
next file extent item.
The following script that calls dbench was used to measure the impact of
this change on a VM with 8 CPUs, 16Gb of ram, using a raw NVMe device
directly (no intermediary filesystem on the host) and using a non-debug
kernel (default configuration on Debian):
$ cat test-dbench.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-m single -d single"
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -D $MNT -t 300 64
umount $MNT
Before this change:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 9326331 0.317 399.957
Close 6851198 0.002 6.402
Rename 394894 2.621 402.819
Unlink 1883131 0.931 398.082
Deltree 256 19.160 303.580
Mkdir 128 0.003 0.016
Qpathinfo 8452314 0.068 116.133
Qfileinfo 1481921 0.001 5.081
Qfsinfo 1549963 0.002 4.444
Sfileinfo 759679 0.084 17.079
Find 3268168 0.396 118.196
WriteX 4653310 0.056 110.993
ReadX 14618818 0.005 23.314
LockX 30364 0.003 0.497
UnlockX 30364 0.002 1.720
Flush 653619 16.954 569.299
Throughput 966.651 MB/sec 64 clients 64 procs max_latency=569.377 ms
After this change:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 9710433 0.302 232.449
Close 7132948 0.002 11.496
Rename 411144 2.452 131.805
Unlink 1960961 0.893 230.383
Deltree 256 14.858 198.646
Mkdir 128 0.002 0.005
Qpathinfo 8800890 0.066 111.588
Qfileinfo 1542556 0.001 3.852
Qfsinfo 1613835 0.002 5.483
Sfileinfo 790871 0.081 19.492
Find 3402743 0.386 120.185
WriteX 4842918 0.054 179.312
ReadX 15220407 0.005 32.435
LockX 31612 0.003 1.533
UnlockX 31612 0.002 1.047
Flush 680567 16.320 463.323
Throughput 1016.59 MB/sec 64 clients 64 procs max_latency=463.327 ms
+5.0% throughput, -20.5% max latency
Also, the following test using fio was run:
$ cat test-fio.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 4 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
BLOCK_SIZE=$4
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=randwrite
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=$BLOCK_SIZE
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo
echo "Using fio config:"
echo
cat /tmp/fio-job.ini
echo
echo "mount options: $MOUNT_OPTIONS"
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
echo "Creating nodatacow files before fio runs..."
for ((i = 0; i < $NUM_JOBS; i++)); do
xfs_io -f -c "pwrite -b 128M 0 $FILE_SIZE" "$MNT/writers.$i.0"
done
sync
fio /tmp/fio-job.ini
umount $MNT
Before this change:
$ ./test-fio.sh 16 512M 2 4K
(...)
WRITE: bw=28.3MiB/s (29.6MB/s), 28.3MiB/s-28.3MiB/s (29.6MB/s-29.6MB/s), io=8192MiB (8590MB), run=289800-289800msec
After this change:
$ ./test-fio.sh 16 512M 2 4K
(...)
WRITE: bw=31.2MiB/s (32.7MB/s), 31.2MiB/s-31.2MiB/s (32.7MB/s-32.7MB/s), io=8192MiB (8590MB), run=262845-262845msec
+9.7% throughput, -9.8% runtime
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-18 11:00:17 +00:00
|
|
|
if (!path->nodes[0])
|
|
|
|
|
continue;
|
2008-10-30 14:20:02 -04:00
|
|
|
path->slots[0]++;
|
|
|
|
|
goto next_slot;
|
2008-08-05 13:05:02 -04:00
|
|
|
}
|
|
|
|
|
|
2019-08-21 10:42:57 +03:00
|
|
|
/*
|
|
|
|
|
* COW range from cow_start to found_key.offset - 1. As the key
|
|
|
|
|
* will contain the beginning of the first extent that can be
|
|
|
|
|
* NOCOW, following one which needs to be COW'ed
|
|
|
|
|
*/
|
2008-10-30 14:20:02 -04:00
|
|
|
if (cow_start != (u64)-1) {
|
2020-06-03 08:55:21 +03:00
|
|
|
ret = fallback_to_cow(inode, locked_page,
|
2020-06-03 08:55:20 +03:00
|
|
|
cow_start, found_key.offset - 1,
|
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
When doing a buffered write we always try to reserve data space for it,
even when the file has the NOCOW bit set or the write falls into a file
range covered by a prealloc extent. This is done both because it is
expensive to check if we can do a nocow write (checking if an extent is
shared through reflinks or if there's a hole in the range for example),
and because when writeback starts we might actually need to fallback to
COW mode (for example the block group containing the target extents was
turned into RO mode due to a scrub or balance).
When we are unable to reserve data space we check if we can do a nocow
write, and if we can, we proceed with dirtying the pages and setting up
the range for delalloc. In this case the bytes_may_use counter of the
data space_info object is not incremented, unlike in the case where we
are able to reserve data space (done through btrfs_check_data_free_space()
which calls btrfs_alloc_data_chunk_ondemand()).
Later when running delalloc we attempt to start writeback in nocow mode
but we might revert back to cow mode, for example because in the meanwhile
a block group was turned into RO mode by a scrub or relocation. The cow
path after successfully allocating an extent ends up calling
btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of
the data space_info object to have been incremented before - but we did
not do it when the buffered write started, since there was not enough
available data space. So btrfs_add_reserved_bytes() ends up decrementing
the bytes_may_use counter anyway, and when the counter's current value
is smaller than the size of the allocated extent we get a stack trace
like the following:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...)
CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
Workqueue: writeback wb_workfn (flush-btrfs-1754)
RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs]
Code: ff ff 48 (...)
RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287
RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410
RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000
R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400
R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800
FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
find_free_extent+0x4a0/0x16c0 [btrfs]
btrfs_reserve_extent+0x91/0x180 [btrfs]
cow_file_range+0x12d/0x490 [btrfs]
run_delalloc_nocow+0x341/0xa40 [btrfs]
btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
? find_lock_delalloc_range+0x221/0x250 [btrfs]
writepage_delalloc+0xe8/0x150 [btrfs]
__extent_writepage+0xe8/0x4c0 [btrfs]
extent_write_cache_pages+0x237/0x530 [btrfs]
? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs]
extent_writepages+0x44/0xa0 [btrfs]
do_writepages+0x23/0x80
__writeback_single_inode+0x59/0x700
writeback_sb_inodes+0x267/0x5f0
__writeback_inodes_wb+0x87/0xe0
wb_writeback+0x382/0x590
? wb_workfn+0x4a2/0x6c0
wb_workfn+0x4a2/0x6c0
process_one_work+0x26d/0x6a0
worker_thread+0x4f/0x3e0
? process_one_work+0x6a0/0x6a0
kthread+0x103/0x140
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x3a/0x50
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffff94ebdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace f9f6ef8ec4cd8ec9 ]---
So to fix this, when falling back into cow mode check if space was not
reserved, by testing for the bit EXTENT_NORESERVE in the respective file
range, and if not, increment the bytes_may_use counter for the data
space_info object. Also clear the EXTENT_NORESERVE bit from the range, so
that if the cow path fails it decrements the bytes_may_use counter when
clearing the delalloc range (through the btrfs_clear_delalloc_extent()
callback).
Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-27 11:16:07 +01:00
|
|
|
page_started, nr_written);
|
2020-07-06 09:14:12 -04:00
|
|
|
if (ret)
|
2012-03-12 16:03:00 +01:00
|
|
|
goto error;
|
2008-10-30 14:20:02 -04:00
|
|
|
cow_start = (u64)-1;
|
2008-08-05 13:05:02 -04:00
|
|
|
}
|
2008-10-30 14:20:02 -04:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_end = cur_offset + nocow_args.num_bytes - 1;
|
|
|
|
|
|
2008-10-30 14:25:28 -04:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2022-03-30 15:31:06 +01:00
|
|
|
u64 orig_start = found_key.offset - nocow_args.extent_offset;
|
2019-08-21 10:42:03 +03:00
|
|
|
struct extent_map *em;
|
2017-01-31 07:50:22 -08:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
|
2017-01-31 07:50:22 -08:00
|
|
|
orig_start,
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_args.disk_bytenr, /* block_start */
|
|
|
|
|
nocow_args.num_bytes, /* block_len */
|
|
|
|
|
nocow_args.disk_num_bytes, /* orig_block_len */
|
2017-01-31 07:50:22 -08:00
|
|
|
ram_bytes, BTRFS_COMPRESS_NONE,
|
|
|
|
|
BTRFS_ORDERED_PREALLOC);
|
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto error;
|
2008-10-30 14:25:28 -04:00
|
|
|
}
|
2017-01-31 07:50:22 -08:00
|
|
|
free_extent_map(em);
|
2019-11-06 12:11:56 -08:00
|
|
|
ret = btrfs_add_ordered_extent(inode,
|
2022-03-30 15:31:06 +01:00
|
|
|
cur_offset, nocow_args.num_bytes,
|
|
|
|
|
nocow_args.num_bytes,
|
|
|
|
|
nocow_args.disk_bytenr,
|
|
|
|
|
nocow_args.num_bytes, 0,
|
2019-11-06 12:11:56 -08:00
|
|
|
1 << BTRFS_ORDERED_PREALLOC,
|
|
|
|
|
BTRFS_COMPRESS_NONE);
|
2019-08-22 17:24:20 +03:00
|
|
|
if (ret) {
|
2020-06-03 08:55:21 +03:00
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_end, 0);
|
2019-08-22 17:24:20 +03:00
|
|
|
goto error;
|
|
|
|
|
}
|
2008-10-30 14:25:28 -04:00
|
|
|
} else {
|
2020-06-03 08:55:21 +03:00
|
|
|
ret = btrfs_add_ordered_extent(inode, cur_offset,
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_args.num_bytes,
|
|
|
|
|
nocow_args.num_bytes,
|
|
|
|
|
nocow_args.disk_bytenr,
|
|
|
|
|
nocow_args.num_bytes,
|
2019-11-06 12:11:56 -08:00
|
|
|
0,
|
|
|
|
|
1 << BTRFS_ORDERED_NOCOW,
|
|
|
|
|
BTRFS_COMPRESS_NONE);
|
2019-08-22 17:24:20 +03:00
|
|
|
if (ret)
|
|
|
|
|
goto error;
|
2008-10-30 14:25:28 -04:00
|
|
|
}
|
2008-10-30 14:20:02 -04:00
|
|
|
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
other hand there's access to the tree in the NOCOW path, when incrementing
a block group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
if (nocow) {
|
|
|
|
|
btrfs_dec_nocow_writers(bg);
|
|
|
|
|
nocow = false;
|
|
|
|
|
}
|
2008-11-06 22:02:51 -05:00
|
|
|
|
2021-09-09 01:19:25 +09:00
|
|
|
if (btrfs_is_data_reloc_root(root))
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports an error, it can underflow metadata
and lead to a kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handles error from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNTING bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 10:25:51 +08:00
|
|
|
/*
|
|
|
|
|
* Error handled later, as we must prevent
|
|
|
|
|
* extent_clear_unlock_delalloc() in error handler
|
|
|
|
|
* from freeing metadata of created ordered extent.
|
|
|
|
|
*/
|
2020-06-03 08:55:21 +03:00
|
|
|
ret = btrfs_reloc_clone_csums(inode, cur_offset,
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_args.num_bytes);
|
2010-05-16 10:49:59 -04:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
|
2013-07-29 11:20:47 -04:00
|
|
|
locked_page, EXTENT_LOCKED |
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
EXTENT_DELALLOC |
|
|
|
|
|
EXTENT_CLEAR_DATA_RESV,
|
2021-04-07 19:22:13 +08:00
|
|
|
PAGE_UNLOCK | PAGE_SET_ORDERED);
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
|
2008-10-30 14:20:02 -04:00
|
|
|
cur_offset = extent_end;
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports an error, it can underflow metadata
and lead to a kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handles error from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNTING bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 10:25:51 +08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* btrfs_reloc_clone_csums() error, now we're OK to call error
|
|
|
|
|
* handler, as metadata for created ordered extent will only
|
|
|
|
|
* be freed by btrfs_finish_ordered_io().
|
|
|
|
|
*/
|
|
|
|
|
if (ret)
|
|
|
|
|
goto error;
|
2008-10-30 14:20:02 -04:00
|
|
|
if (cur_offset > end)
|
|
|
|
|
break;
|
2007-12-17 20:14:01 -05:00
|
|
|
}
|
2011-04-21 01:20:15 +02:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 14:20:02 -04:00
|
|
|
|
2018-10-30 18:04:04 +08:00
|
|
|
if (cur_offset <= end && cow_start == (u64)-1)
|
2008-10-30 14:20:02 -04:00
|
|
|
cow_start = cur_offset;
|
2012-05-31 15:58:55 -04:00
|
|
|
|
2008-10-30 14:20:02 -04:00
|
|
|
if (cow_start != (u64)-1) {
|
2018-10-30 18:04:04 +08:00
|
|
|
cur_offset = end;
|
2020-06-03 08:55:21 +03:00
|
|
|
ret = fallback_to_cow(inode, locked_page, cow_start, end,
|
|
|
|
|
page_started, nr_written);
|
2013-10-25 16:55:08 -04:00
|
|
|
if (ret)
|
2012-03-12 16:03:00 +01:00
|
|
|
goto error;
|
2008-10-30 14:20:02 -04:00
|
|
|
}
|
|
|
|
|
|
2012-03-12 16:03:00 +01:00
|
|
|
error:
|
2019-08-22 17:24:20 +03:00
|
|
|
if (nocow)
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
other hand there's access to the tree in the NOCOW path, when incrementing
a block group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
btrfs_dec_nocow_writers(bg);
|
2019-08-22 17:24:20 +03:00
|
|
|
|
2012-05-31 15:58:55 -04:00
|
|
|
if (ret && cur_offset < end)
|
2020-06-03 08:55:21 +03:00
|
|
|
extent_clear_unlock_delalloc(inode, cur_offset, end,
|
2013-07-29 11:20:47 -04:00
|
|
|
locked_page, EXTENT_LOCKED |
|
2013-07-29 13:22:24 -04:00
|
|
|
EXTENT_DELALLOC | EXTENT_DEFRAG |
|
|
|
|
|
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
|
2021-01-26 16:33:45 +08:00
|
|
|
PAGE_START_WRITEBACK |
|
2013-07-29 11:20:47 -04:00
|
|
|
PAGE_END_WRITEBACK);
|
2008-08-05 13:05:02 -04:00
|
|
|
btrfs_free_path(path);
|
2012-03-12 16:03:00 +01:00
|
|
|
return ret;
|
2007-12-17 20:14:01 -05:00
|
|
|
}
|
|
|
|
|
|
2021-03-04 09:06:25 -06:00
|
|
|
/*
 * Tell whether the delalloc range [start, end] of @inode is a candidate for
 * the NOCOW write path.
 *
 * Only inodes flagged NODATACOW or PREALLOC qualify. Even then the answer is
 * false when the inode has defrag bytes accounted and the io tree reports
 * EXTENT_DEFRAG for the range.
 */
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
	/* Neither NODATACOW nor PREALLOC: never try NOCOW. */
	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return false;

	/* Nothing pending defrag on this inode, no need to query the io tree. */
	if (!inode->defrag_bytes)
		return true;

	/* A range marked for defrag must not take the NOCOW path. */
	return !test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
			       0, NULL);
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
 * Function to process delayed allocation (create CoW) for ranges which are
 * being touched for the first time.
 *
 * Picks one write path for the range [start, end] of @inode:
 *  - run_delalloc_nocow() when should_nocow() accepts the range,
 *  - run_delalloc_zoned() (zoned filesystem) or cow_file_range() when the
 *    inode cannot be compressed or the range does not need compression,
 *  - cow_file_range_async() otherwise, after setting
 *    BTRFS_INODE_HAS_ASYNC_EXTENT in the inode runtime flags.
 *
 * @inode:        inode owning the range
 * @locked_page:  page the range must at least partly cover (asserted)
 * @start:        first byte of the range
 * @end:          last byte of the range; the length handed to the cleanup
 *                helper is end - start + 1
 * @page_started: passed through to the selected helper
 * @nr_written:   passed through to the selected helper
 * @wbc:          only forwarded to cow_file_range_async()
 *
 * Returns zero or a negative value (asserted). When the result is nonzero,
 * btrfs_cleanup_ordered_extents() is called for the whole range before
 * returning.
 */
|
2020-06-03 08:55:29 +03:00
|
|
|
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
|
2018-11-01 14:09:46 +02:00
|
|
|
u64 start, u64 end, int *page_started, unsigned long *nr_written,
|
|
|
|
|
struct writeback_control *wbc)
|
2007-12-17 20:14:01 -05:00
|
|
|
{
|
|
|
|
|
int ret;
|
2021-02-04 19:22:07 +09:00
|
|
|
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
|
2008-06-25 16:01:30 -04:00
|
|
|
|
btrfs: subpage: avoid potential deadlock with compression and delalloc
[BUG]
With experimental subpage compression enabled, a simple fsstress can
lead to self deadlock on page 720896:
mkfs.btrfs -f -s 4k $dev > /dev/null
mount $dev -o compress $mnt
$fsstress -p 1 -n 100 -w -d $mnt -v -s 1625511156
[CAUSE]
If we have a file layout looks like below:
0 32K 64K 96K 128K
|//| |///////////////|
4K
Then we run delalloc range for the inode, it will:
- Call find_lock_delalloc_range() with @delalloc_start = 0
Then we got a delalloc range [0, 4K).
This range will be COWed.
- Call find_lock_delalloc_range() again with @delalloc_start = 4K
Since find_lock_delalloc_range() never cares whether the range
is still inside page range [0, 64K), it will return range [64K, 128K).
This range meets the condition for subpage compression, will go
through async COW path.
And async COW path will return @page_started.
But that @page_started is now for range [64K, 128K), not for range
[0, 64K).
- writepage_dellloc() returned 1 for page [0, 64K)
Thus page [0, 64K) will not be unlocked, nor its page dirty status
will be cleared.
Next time when we try to lock page [0, 64K) we will deadlock, as there
is no one to release page [0, 64K).
This problem will never happen for regular page size as one page only
contains one sector. After the first find_lock_delalloc_range() call,
the @delalloc_end will go beyond @page_end no matter if we found a
delalloc range or not
Thus this bug only happens for subpage, as now we need multiple runs to
exhaust the delalloc range of a page.
[FIX]
Fix the problem by ensuring the delalloc range we ran at least started
inside @locked_page.
So that we will never get incorrect @page_started.
And to prevent such problem from happening again:
- Make find_lock_delalloc_range() return false if the found range is
beyond @end value passed in.
Since @end will be utilized now, add an ASSERT() to ensure we pass
correct @end into find_lock_delalloc_range().
This also means, for selftests we needs to populate @end before calling
find_lock_delalloc_range().
- New ASSERT() in find_lock_delalloc_range()
Now we will make sure the @start/@end passed in at least covers part
of the page.
- New ASSERT() in run_delalloc_range()
To make sure the range at least starts inside @locked page.
- Use @delalloc_start as proper cursor, while @delalloc_end is always
reset to @page_end.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-27 15:22:07 +08:00
|
|
|
/*
|
|
|
|
|
* The range must cover part of the @locked_page, or the returned
|
|
|
|
|
* @page_started can confuse the caller.
|
|
|
|
|
*/
|
|
|
|
|
ASSERT(!(end <= page_offset(locked_page) ||
|
|
|
|
|
start >= page_offset(locked_page) + PAGE_SIZE));
|
|
|
|
|
|
2021-03-04 09:06:25 -06:00
|
|
|
/* Select the write path: NOCOW first, then plain COW, then async COW. */
if (should_nocow(inode, start, end)) {
|
2021-09-09 01:19:29 +09:00
|
|
|
/*
|
|
|
|
|
* Normally on a zoned device we're only doing COW writes, but
|
|
|
|
|
* in case of relocation on a zoned filesystem we have taken
|
|
|
|
|
* precaution, that we're only writing sequentially. It's safe
|
|
|
|
|
* to use run_delalloc_nocow() here, like for regular
|
|
|
|
|
* preallocated inodes.
|
|
|
|
|
*/
|
2022-03-23 09:45:58 +08:00
|
|
|
ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
|
2020-06-03 08:55:29 +03:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2021-03-04 09:06:25 -06:00
|
|
|
page_started, nr_written);
|
2022-04-15 16:04:05 +08:00
|
|
|
} else if (!btrfs_inode_can_compress(inode) ||
|
2020-06-03 08:55:29 +03:00
|
|
|
!inode_need_compress(inode, start, end)) {
|
2021-02-04 19:22:07 +09:00
|
|
|
/* No compression for this range: zoned helper on zoned fs, else regular COW. */
if (zoned)
|
|
|
|
|
ret = run_delalloc_zoned(inode, locked_page, start, end,
|
|
|
|
|
page_started, nr_written);
|
|
|
|
|
else
|
|
|
|
|
ret = cow_file_range(inode, locked_page, start, end,
|
2022-07-09 08:18:49 +09:00
|
|
|
page_started, nr_written, 1, NULL);
|
2012-06-08 15:26:47 -04:00
|
|
|
} else {
|
2020-06-03 08:55:29 +03:00
|
|
|
/* Compression applies: flag the inode, then use the async COW helper (the only user of wbc). */
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
|
|
|
|
|
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
|
2019-10-29 18:28:57 +01:00
|
|
|
page_started, nr_written);
|
2012-06-08 15:26:47 -04:00
|
|
|
}
|
2021-07-28 14:05:05 +08:00
|
|
|
/*
 * None of the helpers may return a positive value. A nonzero result
 * triggers the ordered extent cleanup for [start, end] further down.
 */
ASSERT(ret <= 0);
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 10:25:52 +08:00
|
|
|
if (ret)
|
2020-06-03 08:55:29 +03:00
|
|
|
btrfs_cleanup_ordered_extents(inode, locked_page, start,
|
2018-11-21 17:10:52 +02:00
|
|
|
end - start + 1);
|
2007-08-27 16:49:44 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-01 14:09:53 +02:00
|
|
|
void btrfs_split_delalloc_extent(struct inode *inode,
|
|
|
|
|
struct extent_state *orig, u64 split)
|
2009-09-11 16:12:44 -04:00
|
|
|
{
|
btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size
On zoned filesystem, data write out is limited by max_zone_append_size,
and a large ordered extent is split according to the size of a bio. OTOH,
the number of extents to be written is calculated using
BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the
metadata bytes to update and/or create the metadata items.
The metadata reservation is done at e.g, btrfs_buffered_write() and then
released according to the estimation changes. Thus, if the number of extent
increases massively, the reserved metadata can run out.
The increase of the number of extents easily occurs on zoned filesystem
if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size. And, it causes the
following warning on a small RAM environment with disabling metadata
over-commit (in the following patch).
[75721.498492] ------------[ cut here ]------------
[75721.505624] BTRFS: block rsv 1 returned -28
[75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109
[75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021
[75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286
[75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000
[75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e
[75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7
[75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28
[75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a
[75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000
[75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0
[75721.730499] Call Trace:
[75721.735166] <TASK>
[75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs]
[75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs]
[75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.769520] ? push_leaf_left+0x420/0x620 [btrfs]
[75721.776431] ? memcpy+0x4e/0x60
[75721.781931] split_leaf+0x433/0x12d0 [btrfs]
[75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs]
[75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs]
[75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs]
[75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs]
[75721.818300] ? lock_downgrade+0x7c0/0x7c0
[75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs]
[75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs]
[75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs]
[75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs]
[75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs]
[75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs]
[75721.869313] ? rcu_read_lock_sched_held+0x16/0x80
[75721.876085] ? lock_release+0x552/0xf80
[75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs]
[75721.888886] ? __kasan_check_write+0x14/0x20
[75721.895152] ? do_raw_read_unlock+0x44/0x80
[75721.901323] ? _raw_write_lock_irq+0x60/0x80
[75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs]
[75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs]
[75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs]
[75721.929166] ? _raw_write_unlock+0x23/0x40
[75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs]
[75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs]
[75721.949906] ? try_to_wake_up+0x30/0x14a0
[75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs]
[75721.962661] ? rcu_read_lock_sched_held+0x16/0x80
[75721.969111] ? lock_acquire+0x41b/0x4c0
[75721.974982] finish_ordered_fn+0x15/0x20 [btrfs]
[75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs]
[75721.988184] ? _raw_spin_unlock_irq+0x28/0x50
[75721.994643] process_one_work+0x815/0x1460
[75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250
[75722.006643] ? do_raw_spin_trylock+0xbb/0x190
[75722.013086] worker_thread+0x59a/0xeb0
[75722.018511] kthread+0x2ac/0x360
[75722.023428] ? process_one_work+0x1460/0x1460
[75722.029431] ? kthread_complete_and_exit+0x30/0x30
[75722.036044] ret_from_fork+0x22/0x30
[75722.041255] </TASK>
[75722.045047] irq event stamp: 0
[75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[75722.057610] hardirqs last disabled at (0): [<ffffffff8118a94a>] copy_process+0x1c1a/0x66b0
[75722.067533] softirqs last enabled at (0): [<ffffffff8118a989>] copy_process+0x1c59/0x66b0
[75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0
[75722.085335] ---[ end trace 0000000000000000 ]---
To fix the estimation, we need to introduce fs_info->max_extent_size to
replace BTRFS_MAX_EXTENT_SIZE, which allow setting the different size for
regular vs zoned filesystem.
Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On zoned
filesystem, it is set to fs_info->max_zone_append_size.
CC: stable@vger.kernel.org # 5.12+
Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-07-09 08:18:40 +09:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2015-02-11 15:08:59 -05:00
|
|
|
u64 size;
|
|
|
|
|
|
2010-05-16 10:48:47 -04:00
|
|
|
/* not delalloc, ignore it */
|
2009-09-11 16:12:44 -04:00
|
|
|
if (!(orig->state & EXTENT_DELALLOC))
|
2011-07-21 16:56:09 +00:00
|
|
|
return;
|
2009-09-11 16:12:44 -04:00
|
|
|
|
2015-02-11 15:08:59 -05:00
|
|
|
size = orig->end - orig->start + 1;
|
btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size
On a zoned filesystem, data write out is limited by max_zone_append_size,
and a large ordered extent is split according to the size of a bio. OTOH,
the number of extents to be written is calculated using
BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the
metadata bytes to update and/or create the metadata items.
The metadata reservation is done in e.g. btrfs_buffered_write() and then
released according to the estimation changes. Thus, if the number of extents
increases massively, the reserved metadata can run out.
The increase of the number of extents easily occurs on a zoned filesystem
if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size. And, it causes the
following warning in a small-RAM environment with metadata over-commit
disabled (in the following patch).
[75721.498492] ------------[ cut here ]------------
[75721.505624] BTRFS: block rsv 1 returned -28
[75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109
[75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021
[75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286
[75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000
[75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e
[75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7
[75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28
[75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a
[75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000
[75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0
[75721.730499] Call Trace:
[75721.735166] <TASK>
[75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs]
[75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs]
[75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.769520] ? push_leaf_left+0x420/0x620 [btrfs]
[75721.776431] ? memcpy+0x4e/0x60
[75721.781931] split_leaf+0x433/0x12d0 [btrfs]
[75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs]
[75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs]
[75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs]
[75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs]
[75721.818300] ? lock_downgrade+0x7c0/0x7c0
[75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs]
[75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs]
[75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs]
[75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs]
[75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs]
[75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs]
[75721.869313] ? rcu_read_lock_sched_held+0x16/0x80
[75721.876085] ? lock_release+0x552/0xf80
[75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs]
[75721.888886] ? __kasan_check_write+0x14/0x20
[75721.895152] ? do_raw_read_unlock+0x44/0x80
[75721.901323] ? _raw_write_lock_irq+0x60/0x80
[75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs]
[75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs]
[75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs]
[75721.929166] ? _raw_write_unlock+0x23/0x40
[75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs]
[75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs]
[75721.949906] ? try_to_wake_up+0x30/0x14a0
[75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs]
[75721.962661] ? rcu_read_lock_sched_held+0x16/0x80
[75721.969111] ? lock_acquire+0x41b/0x4c0
[75721.974982] finish_ordered_fn+0x15/0x20 [btrfs]
[75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs]
[75721.988184] ? _raw_spin_unlock_irq+0x28/0x50
[75721.994643] process_one_work+0x815/0x1460
[75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250
[75722.006643] ? do_raw_spin_trylock+0xbb/0x190
[75722.013086] worker_thread+0x59a/0xeb0
[75722.018511] kthread+0x2ac/0x360
[75722.023428] ? process_one_work+0x1460/0x1460
[75722.029431] ? kthread_complete_and_exit+0x30/0x30
[75722.036044] ret_from_fork+0x22/0x30
[75722.041255] </TASK>
[75722.045047] irq event stamp: 0
[75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[75722.057610] hardirqs last disabled at (0): [<ffffffff8118a94a>] copy_process+0x1c1a/0x66b0
[75722.067533] softirqs last enabled at (0): [<ffffffff8118a989>] copy_process+0x1c59/0x66b0
[75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0
[75722.085335] ---[ end trace 0000000000000000 ]---
To fix the estimation, we need to introduce fs_info->max_extent_size to
replace BTRFS_MAX_EXTENT_SIZE, which allows setting a different size for
regular vs zoned filesystems.
Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On a zoned
filesystem, it is set to fs_info->max_zone_append_size.
CC: stable@vger.kernel.org # 5.12+
Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-07-09 08:18:40 +09:00
|
|
|
if (size > fs_info->max_extent_size) {
|
2017-01-04 11:09:51 +01:00
|
|
|
u32 num_extents;
|
2015-02-11 15:08:59 -05:00
|
|
|
u64 new_size;
|
|
|
|
|
|
|
|
|
|
/*
|
2018-11-01 14:09:52 +02:00
|
|
|
* See the explanation in btrfs_merge_delalloc_extent, the same
|
2015-03-13 15:01:24 -04:00
|
|
|
* applies here, just in reverse.
|
2015-02-11 15:08:59 -05:00
|
|
|
*/
|
|
|
|
|
new_size = orig->end - split + 1;
|
2022-07-09 08:18:41 +09:00
|
|
|
num_extents = count_max_extents(fs_info, new_size);
|
2015-03-13 15:01:24 -04:00
|
|
|
new_size = split - orig->start;
|
2022-07-09 08:18:41 +09:00
|
|
|
num_extents += count_max_extents(fs_info, new_size);
|
|
|
|
|
if (count_max_extents(fs_info, size) >= num_extents)
|
2015-02-11 15:08:59 -05:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2017-10-19 14:15:55 -04:00
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2009-09-11 16:12:44 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2018-11-01 14:09:52 +02:00
|
|
|
* Handle merged delayed allocation extents so we can keep track of new extents
|
|
|
|
|
* that are just merged onto old extents, such as when we are doing sequential
|
|
|
|
|
* writes, so we can properly account for the metadata space we'll need.
|
2009-09-11 16:12:44 -04:00
|
|
|
*/
|
2018-11-01 14:09:52 +02:00
|
|
|
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
|
|
|
|
|
struct extent_state *other)
|
2009-09-11 16:12:44 -04:00
|
|
|
{
|
btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size
On zoned filesystem, data write out is limited by max_zone_append_size,
and a large ordered extent is split according the size of a bio. OTOH,
the number of extents to be written is calculated using
BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the
metadata bytes to update and/or create the metadata items.
The metadata reservation is done at e.g, btrfs_buffered_write() and then
released according to the estimation changes. Thus, if the number of extent
increases massively, the reserved metadata can run out.
The increase of the number of extents easily occurs on zoned filesystem
if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size. And, it causes the
following warning on a small RAM environment with disabling metadata
over-commit (in the following patch).
[75721.498492] ------------[ cut here ]------------
[75721.505624] BTRFS: block rsv 1 returned -28
[75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109
[75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021
[75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286
[75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000
[75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e
[75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7
[75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28
[75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a
[75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000
[75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0
[75721.730499] Call Trace:
[75721.735166] <TASK>
[75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs]
[75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs]
[75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.769520] ? push_leaf_left+0x420/0x620 [btrfs]
[75721.776431] ? memcpy+0x4e/0x60
[75721.781931] split_leaf+0x433/0x12d0 [btrfs]
[75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs]
[75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs]
[75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs]
[75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs]
[75721.818300] ? lock_downgrade+0x7c0/0x7c0
[75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs]
[75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs]
[75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs]
[75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs]
[75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs]
[75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs]
[75721.869313] ? rcu_read_lock_sched_held+0x16/0x80
[75721.876085] ? lock_release+0x552/0xf80
[75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs]
[75721.888886] ? __kasan_check_write+0x14/0x20
[75721.895152] ? do_raw_read_unlock+0x44/0x80
[75721.901323] ? _raw_write_lock_irq+0x60/0x80
[75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs]
[75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs]
[75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs]
[75721.929166] ? _raw_write_unlock+0x23/0x40
[75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs]
[75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs]
[75721.949906] ? try_to_wake_up+0x30/0x14a0
[75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs]
[75721.962661] ? rcu_read_lock_sched_held+0x16/0x80
[75721.969111] ? lock_acquire+0x41b/0x4c0
[75721.974982] finish_ordered_fn+0x15/0x20 [btrfs]
[75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs]
[75721.988184] ? _raw_spin_unlock_irq+0x28/0x50
[75721.994643] process_one_work+0x815/0x1460
[75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250
[75722.006643] ? do_raw_spin_trylock+0xbb/0x190
[75722.013086] worker_thread+0x59a/0xeb0
[75722.018511] kthread+0x2ac/0x360
[75722.023428] ? process_one_work+0x1460/0x1460
[75722.029431] ? kthread_complete_and_exit+0x30/0x30
[75722.036044] ret_from_fork+0x22/0x30
[75722.041255] </TASK>
[75722.045047] irq event stamp: 0
[75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[75722.057610] hardirqs last disabled at (0): [<ffffffff8118a94a>] copy_process+0x1c1a/0x66b0
[75722.067533] softirqs last enabled at (0): [<ffffffff8118a989>] copy_process+0x1c59/0x66b0
[75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0
[75722.085335] ---[ end trace 0000000000000000 ]---
To fix the estimation, we need to introduce fs_info->max_extent_size to
replace BTRFS_MAX_EXTENT_SIZE, which allow setting the different size for
regular vs zoned filesystem.
Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On zoned
filesystem, it is set to fs_info->max_zone_append_size.
CC: stable@vger.kernel.org # 5.12+
Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-07-09 08:18:40 +09:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2015-02-11 15:08:59 -05:00
|
|
|
u64 new_size, old_size;
|
2017-01-04 11:09:51 +01:00
|
|
|
u32 num_extents;
|
2015-02-11 15:08:59 -05:00
|
|
|
|
2009-09-11 16:12:44 -04:00
|
|
|
/* not delalloc, ignore it */
|
|
|
|
|
if (!(other->state & EXTENT_DELALLOC))
|
2011-07-21 16:56:09 +00:00
|
|
|
return;
|
2009-09-11 16:12:44 -04:00
|
|
|
|
2015-03-13 15:12:08 -04:00
|
|
|
if (new->start > other->start)
|
|
|
|
|
new_size = new->end - other->start + 1;
|
|
|
|
|
else
|
|
|
|
|
new_size = other->end - new->start + 1;
|
2015-02-11 15:08:59 -05:00
|
|
|
|
|
|
|
|
/* we're not bigger than the max, unreserve the space and go */
|
btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size
On zoned filesystem, data write out is limited by max_zone_append_size,
and a large ordered extent is split according the size of a bio. OTOH,
the number of extents to be written is calculated using
BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the
metadata bytes to update and/or create the metadata items.
The metadata reservation is done at e.g, btrfs_buffered_write() and then
released according to the estimation changes. Thus, if the number of extent
increases massively, the reserved metadata can run out.
The increase of the number of extents easily occurs on zoned filesystem
if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size. And, it causes the
following warning on a small RAM environment with disabling metadata
over-commit (in the following patch).
[75721.498492] ------------[ cut here ]------------
[75721.505624] BTRFS: block rsv 1 returned -28
[75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109
[75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021
[75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs]
[75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286
[75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000
[75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e
[75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7
[75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28
[75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a
[75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000
[75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0
[75721.730499] Call Trace:
[75721.735166] <TASK>
[75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs]
[75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs]
[75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs]
[75721.769520] ? push_leaf_left+0x420/0x620 [btrfs]
[75721.776431] ? memcpy+0x4e/0x60
[75721.781931] split_leaf+0x433/0x12d0 [btrfs]
[75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs]
[75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs]
[75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs]
[75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs]
[75721.818300] ? lock_downgrade+0x7c0/0x7c0
[75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs]
[75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs]
[75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs]
[75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs]
[75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs]
[75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs]
[75721.869313] ? rcu_read_lock_sched_held+0x16/0x80
[75721.876085] ? lock_release+0x552/0xf80
[75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs]
[75721.888886] ? __kasan_check_write+0x14/0x20
[75721.895152] ? do_raw_read_unlock+0x44/0x80
[75721.901323] ? _raw_write_lock_irq+0x60/0x80
[75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs]
[75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs]
[75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs]
[75721.929166] ? _raw_write_unlock+0x23/0x40
[75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs]
[75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs]
[75721.949906] ? try_to_wake_up+0x30/0x14a0
[75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs]
[75721.962661] ? rcu_read_lock_sched_held+0x16/0x80
[75721.969111] ? lock_acquire+0x41b/0x4c0
[75721.974982] finish_ordered_fn+0x15/0x20 [btrfs]
[75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs]
[75721.988184] ? _raw_spin_unlock_irq+0x28/0x50
[75721.994643] process_one_work+0x815/0x1460
[75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250
[75722.006643] ? do_raw_spin_trylock+0xbb/0x190
[75722.013086] worker_thread+0x59a/0xeb0
[75722.018511] kthread+0x2ac/0x360
[75722.023428] ? process_one_work+0x1460/0x1460
[75722.029431] ? kthread_complete_and_exit+0x30/0x30
[75722.036044] ret_from_fork+0x22/0x30
[75722.041255] </TASK>
[75722.045047] irq event stamp: 0
[75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[75722.057610] hardirqs last disabled at (0): [<ffffffff8118a94a>] copy_process+0x1c1a/0x66b0
[75722.067533] softirqs last enabled at (0): [<ffffffff8118a989>] copy_process+0x1c59/0x66b0
[75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0
[75722.085335] ---[ end trace 0000000000000000 ]---
To fix the estimation, we need to introduce fs_info->max_extent_size to
replace BTRFS_MAX_EXTENT_SIZE, which allow setting the different size for
regular vs zoned filesystem.
Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On zoned
filesystem, it is set to fs_info->max_zone_append_size.
CC: stable@vger.kernel.org # 5.12+
Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-07-09 08:18:40 +09:00
|
|
|
if (new_size <= fs_info->max_extent_size) {
|
2015-02-11 15:08:59 -05:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2017-10-19 14:15:55 -04:00
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
|
2015-02-11 15:08:59 -05:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2015-03-13 15:01:24 -04:00
|
|
|
* We have to add up either side to figure out how many extents were
|
|
|
|
|
* accounted for before we merged into one big extent. If the number of
|
|
|
|
|
* extents we accounted for is <= the amount we need for the new range
|
|
|
|
|
* then we can return, otherwise drop. Think of it like this
|
|
|
|
|
*
|
|
|
|
|
* [ 4k][MAX_SIZE]
|
|
|
|
|
*
|
|
|
|
|
* So we've grown the extent by a MAX_SIZE extent, this would mean we
|
|
|
|
|
* need 2 outstanding extents, on one side we have 1 and the other side
|
|
|
|
|
* we have 1 so they are == and we can return. But in this case
|
|
|
|
|
*
|
|
|
|
|
* [MAX_SIZE+4k][MAX_SIZE+4k]
|
|
|
|
|
*
|
|
|
|
|
* Each range on their own accounts for 2 extents, but merged together
|
|
|
|
|
* they are only 3 extents worth of accounting, so we need to drop in
|
|
|
|
|
* this case.
|
2015-02-11 15:08:59 -05:00
|
|
|
*/
|
2015-03-13 15:01:24 -04:00
|
|
|
old_size = other->end - other->start + 1;
|
2022-07-09 08:18:41 +09:00
|
|
|
num_extents = count_max_extents(fs_info, old_size);
|
2015-03-13 15:01:24 -04:00
|
|
|
old_size = new->end - new->start + 1;
|
2022-07-09 08:18:41 +09:00
|
|
|
num_extents += count_max_extents(fs_info, old_size);
|
|
|
|
|
if (count_max_extents(fs_info, new_size) >= num_extents)
|
2015-02-11 15:08:59 -05:00
|
|
|
return;
|
|
|
|
|
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2017-10-19 14:15:55 -04:00
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2009-09-11 16:12:44 -04:00
|
|
|
}
|
|
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
|
|
|
|
|
struct inode *inode)
|
|
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
|
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
|
|
|
|
|
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
|
|
|
|
|
&root->delalloc_inodes);
|
|
|
|
|
set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
|
root->nr_delalloc_inodes++;
|
|
|
|
|
if (root->nr_delalloc_inodes == 1) {
|
2016-06-22 18:54:23 -04:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
BUG_ON(!list_empty(&root->delalloc_root));
|
|
|
|
|
list_add_tail(&root->delalloc_root,
|
2016-06-22 18:54:23 -04:00
|
|
|
&fs_info->delalloc_roots);
|
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-27 12:21:51 +03:00
|
|
|
|
|
|
|
|
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
|
|
|
|
|
struct btrfs_inode *inode)
|
2013-05-15 07:48:22 +00:00
|
|
|
{
|
2018-06-29 10:56:42 +02:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2016-06-22 18:54:23 -04:00
|
|
|
|
2017-02-20 13:51:07 +02:00
|
|
|
if (!list_empty(&inode->delalloc_inodes)) {
|
|
|
|
|
list_del_init(&inode->delalloc_inodes);
|
2013-05-15 07:48:22 +00:00
|
|
|
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2017-02-20 13:51:07 +02:00
|
|
|
&inode->runtime_flags);
|
2013-05-15 07:48:22 +00:00
|
|
|
root->nr_delalloc_inodes--;
|
|
|
|
|
if (!root->nr_delalloc_inodes) {
|
2018-04-27 12:21:52 +03:00
|
|
|
ASSERT(list_empty(&root->delalloc_inodes));
|
2016-06-22 18:54:23 -04:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
BUG_ON(list_empty(&root->delalloc_root));
|
|
|
|
|
list_del_init(&root->delalloc_root);
|
2016-06-22 18:54:23 -04:00
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
|
}
|
2018-04-27 12:21:51 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void btrfs_del_delalloc_inode(struct btrfs_root *root,
|
|
|
|
|
struct btrfs_inode *inode)
|
|
|
|
|
{
|
|
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
|
__btrfs_del_delalloc_inode(root, inode);
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
2018-11-01 14:09:50 +02:00
|
|
|
* Properly track delayed allocation bytes in the inode and to maintain the
|
|
|
|
|
* list of inodes that have pending delalloc work to be done.
|
2008-09-29 15:18:18 -04:00
|
|
|
*/
|
2018-11-01 14:09:50 +02:00
|
|
|
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
|
2020-06-25 17:54:54 +02:00
|
|
|
u32 bits)
|
2008-01-29 15:55:23 -05:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
|
2020-06-25 17:54:54 +02:00
|
|
|
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
|
2014-07-03 18:22:07 +08:00
|
|
|
WARN_ON(1);
|
2008-12-15 15:54:40 -05:00
|
|
|
/*
|
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-20 20:20:32 +00:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-15 15:54:40 -05:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
|
*/
|
2020-06-25 17:54:54 +02:00
|
|
|
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
|
2008-01-29 15:55:23 -05:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 10:48:47 -04:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2022-07-09 08:18:41 +09:00
|
|
|
u32 num_extents = count_max_extents(fs_info, len);
|
2017-02-20 13:50:35 +02:00
|
|
|
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
|
2009-09-11 16:12:44 -04:00
|
|
|
|
2017-10-19 14:15:55 -04:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
|
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2010-03-19 18:07:23 +00:00
|
|
|
|
2015-03-16 17:38:52 -04:00
|
|
|
/* For sanity tests */
|
2016-06-22 18:54:23 -04:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2015-03-16 17:38:52 -04:00
|
|
|
return;
|
|
|
|
|
|
2017-06-20 21:01:20 +03:00
|
|
|
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
|
|
|
|
|
fs_info->delalloc_batch);
|
2013-01-29 10:11:59 +00:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2010-05-16 10:48:47 -04:00
|
|
|
BTRFS_I(inode)->delalloc_bytes += len;
|
2020-06-25 17:54:54 +02:00
|
|
|
if (bits & EXTENT_DEFRAG)
|
2014-07-03 18:22:07 +08:00
|
|
|
BTRFS_I(inode)->defrag_bytes += len;
|
2013-01-29 10:11:59 +00:00
|
|
|
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2013-05-15 07:48:22 +00:00
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
|
|
|
|
btrfs_add_delalloc_inodes(root, inode);
|
2013-01-29 10:11:59 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2008-01-29 15:55:23 -05:00
|
|
|
}
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
|
|
|
|
|
if (!(state->state & EXTENT_DELALLOC_NEW) &&
|
2020-06-25 17:54:54 +02:00
|
|
|
(bits & EXTENT_DELALLOC_NEW)) {
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
|
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
|
|
|
|
|
state->start;
|
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
|
}
|
2008-01-29 15:55:23 -05:00
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
2018-11-01 14:09:51 +02:00
|
|
|
* Once a range is no longer delalloc this function ensures that proper
|
|
|
|
|
* accounting happens.
|
2008-09-29 15:18:18 -04:00
|
|
|
*/
|
2018-11-01 14:09:51 +02:00
|
|
|
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
|
2020-06-25 17:54:54 +02:00
|
|
|
struct extent_state *state, u32 bits)
|
2008-01-29 15:55:23 -05:00
|
|
|
{
|
2018-11-01 14:09:51 +02:00
|
|
|
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
|
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
|
2014-07-03 18:22:07 +08:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2022-07-09 08:18:41 +09:00
|
|
|
u32 num_extents = count_max_extents(fs_info, len);
|
2014-07-03 18:22:07 +08:00
|
|
|
|
2020-06-25 17:54:54 +02:00
|
|
|
if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
|
2017-07-27 19:52:55 +01:00
|
|
|
spin_lock(&inode->lock);
|
2017-02-20 13:51:03 +02:00
|
|
|
inode->defrag_bytes -= len;
|
2017-07-27 19:52:55 +01:00
|
|
|
spin_unlock(&inode->lock);
|
|
|
|
|
}
|
2014-07-03 18:22:07 +08:00
|
|
|
|
2008-12-15 15:54:40 -05:00
|
|
|
/*
|
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-20 20:20:32 +00:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-15 15:54:40 -05:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
|
*/
|
2020-06-25 17:54:54 +02:00
|
|
|
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
|
2017-02-20 13:51:03 +02:00
|
|
|
struct btrfs_root *root = inode->root;
|
2012-07-10 05:28:39 -06:00
|
|
|
bool do_list = !btrfs_is_free_space_inode(inode);
|
2008-04-22 13:26:47 -04:00
|
|
|
|
2017-10-19 14:15:55 -04:00
|
|
|
spin_lock(&inode->lock);
|
|
|
|
|
btrfs_mod_outstanding_extents(inode, -num_extents);
|
|
|
|
|
spin_unlock(&inode->lock);
|
2010-05-16 10:48:47 -04:00
|
|
|
|
2013-09-27 14:57:43 -04:00
|
|
|
/*
|
|
|
|
|
* We don't reserve metadata space for space cache inodes so we
|
2018-11-28 12:05:13 +01:00
|
|
|
* don't need to call delalloc_release_metadata if there is an
|
2013-09-27 14:57:43 -04:00
|
|
|
* error.
|
|
|
|
|
*/
|
2020-06-25 17:54:54 +02:00
|
|
|
if (bits & EXTENT_CLEAR_META_RESV &&
|
2016-06-22 18:54:23 -04:00
|
|
|
root != fs_info->tree_root)
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transcation meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:32 +08:00
|
|
|
btrfs_delalloc_release_metadata(inode, len, false);
|
2010-05-16 10:48:47 -04:00
|
|
|
|
2015-03-16 17:38:52 -04:00
|
|
|
/* For sanity tests. */
|
2016-06-22 18:54:23 -04:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2015-03-16 17:38:52 -04:00
|
|
|
return;
|
|
|
|
|
|
2021-09-09 01:19:25 +09:00
|
|
|
if (!btrfs_is_data_reloc_root(root) &&
|
2017-03-06 23:04:20 +00:00
|
|
|
do_list && !(state->state & EXTENT_NORESERVE) &&
|
2020-06-25 17:54:54 +02:00
|
|
|
(bits & EXTENT_CLEAR_DATA_RESV))
|
2020-06-03 08:55:38 +03:00
|
|
|
btrfs_free_reserved_data_space_noquota(fs_info, len);
|
2009-09-11 16:12:44 -04:00
|
|
|
|
2017-06-20 21:01:20 +03:00
|
|
|
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
|
|
|
|
|
fs_info->delalloc_batch);
|
2017-02-20 13:51:03 +02:00
|
|
|
spin_lock(&inode->lock);
|
|
|
|
|
inode->delalloc_bytes -= len;
|
|
|
|
|
if (do_list && inode->delalloc_bytes == 0 &&
|
2013-01-29 10:11:59 +00:00
|
|
|
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2017-02-20 13:51:07 +02:00
|
|
|
&inode->runtime_flags))
|
2013-05-15 07:48:22 +00:00
|
|
|
btrfs_del_delalloc_inode(root, inode);
|
2017-02-20 13:51:03 +02:00
|
|
|
spin_unlock(&inode->lock);
|
2008-01-29 15:55:23 -05:00
|
|
|
}
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
|
|
|
|
|
if ((state->state & EXTENT_DELALLOC_NEW) &&
|
2020-06-25 17:54:54 +02:00
|
|
|
(bits & EXTENT_DELALLOC_NEW)) {
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
spin_lock(&inode->lock);
|
|
|
|
|
ASSERT(inode->new_delalloc_bytes >= len);
|
|
|
|
|
inode->new_delalloc_bytes -= len;
|
2020-06-25 17:54:54 +02:00
|
|
|
if (bits & EXTENT_ADD_INODE_BYTES)
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
inode_add_bytes(&inode->vfs_inode, len);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
spin_unlock(&inode->lock);
|
|
|
|
|
}
|
2008-01-29 15:55:23 -05:00
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
|
*
|
|
|
|
|
* At IO completion time the cums attached on the ordered extent record
|
|
|
|
|
* are inserted into the btree
|
|
|
|
|
*/
|
2020-10-21 14:24:53 +08:00
|
|
|
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
|
2020-12-02 14:47:57 +08:00
|
|
|
u64 dio_file_offset)
|
2008-02-20 12:07:25 -05:00
|
|
|
{
|
2019-11-06 15:38:43 -08:00
|
|
|
return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-06 22:03:00 -05:00
|
|
|
}
|
2008-04-16 11:15:20 -04:00
|
|
|
|
btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
Damien reported a test failure with btrfs/209. The test itself ran fine,
but the fsck ran afterwards reported a corrupted filesystem.
The filesystem corruption happens because we're splitting an extent and
then writing the extent twice. We have to split the extent though, because
we're creating too large extents for a REQ_OP_ZONE_APPEND operation.
When dumping the extent tree, we can see two EXTENT_ITEMs at the same
start address but different lengths.
$ btrfs inspect dump-tree /dev/nullb1 -t extent
...
item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
The duplicated EXTENT_ITEMs originally come from wrongly split extent_map in
extract_ordered_extent(). Since extract_ordered_extent() uses
create_io_em() to split an existing extent_map, we will have
split->orig_start != split->start. Then, it will be logged with non-zero
"extent data offset". Finally, the logged entries are replayed into
a duplicated EXTENT_ITEM.
Introduce and use a proper splitting function for extent_map. The function is
intended to be simple and specific to extract_ordered_extent(), e.g. it does
not support the compression case (we do not allow splitting compressed
extent_map anyway).
There was a question raised by Qu, in summary why we want to split the
extent map (and not the bio):
The problem is not the limit on the zone end, which as you mention is
the same as the block group end. The problem is that data writes use zone
append (ZA) operations. ZA BIOs cannot be split so a large extent may
need to be processed with multiple ZA BIOs. While that is also true for
regular writes, the major difference is that ZA are "nameless" write
operations giving back the written sectors on completion. And ZA
operations may be reordered by the block layer (not intentionally
though). Combine both of these characteristics and you can see that the
data for a large extent may end up being shuffled when written resulting
in data corruption and the impossibility to map the extent to some start
sector.
To avoid this problem, zoned btrfs uses the principle "one data extent
== one ZA BIO". So large extents need to be split. This is unfortunate,
but we can revisit this later and optimize, e.g. merge back together the
fragments of an extent once written if they actually were written
sequentially in the zone.
Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
CC: stable@vger.kernel.org # 5.12+
CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-28 17:57:28 +09:00
|
|
|
/*
|
|
|
|
|
* Split an extent_map at [start, start + len]
|
|
|
|
|
*
|
|
|
|
|
* This function is intended to be used only for extract_ordered_extent().
|
|
|
|
|
*/
|
|
|
|
|
static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
|
|
|
|
|
u64 pre, u64 post)
|
|
|
|
|
{
|
|
|
|
|
struct extent_map_tree *em_tree = &inode->extent_tree;
|
|
|
|
|
struct extent_map *em;
|
|
|
|
|
struct extent_map *split_pre = NULL;
|
|
|
|
|
struct extent_map *split_mid = NULL;
|
|
|
|
|
struct extent_map *split_post = NULL;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
|
|
/* Sanity check */
|
|
|
|
|
if (pre == 0 && post == 0)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
split_pre = alloc_extent_map();
|
|
|
|
|
if (pre)
|
|
|
|
|
split_mid = alloc_extent_map();
|
|
|
|
|
if (post)
|
|
|
|
|
split_post = alloc_extent_map();
|
|
|
|
|
if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ASSERT(pre + post < len);
|
|
|
|
|
|
|
|
|
|
lock_extent(&inode->io_tree, start, start + len - 1);
|
|
|
|
|
write_lock(&em_tree->lock);
|
|
|
|
|
em = lookup_extent_mapping(em_tree, start, len);
|
|
|
|
|
if (!em) {
|
|
|
|
|
ret = -EIO;
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ASSERT(em->len == len);
|
|
|
|
|
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
|
|
|
|
|
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
|
2021-08-09 09:29:18 +09:00
|
|
|
ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
|
|
|
|
|
ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
|
|
|
|
|
ASSERT(!list_empty(&em->list));
|
btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
Damien reported a test failure with btrfs/209. The test itself ran fine,
but the fsck ran afterwards reported a corrupted filesystem.
The filesystem corruption happens because we're splitting an extent and
then writing the extent twice. We have to split the extent though, because
we're creating too large extents for a REQ_OP_ZONE_APPEND operation.
When dumping the extent tree, we can see two EXTENT_ITEMs at the same
start address but different lengths.
$ btrfs inspect dump-tree /dev/nullb1 -t extent
...
item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
The duplicated EXTENT_ITEMs originally come from wrongly split extent_map in
extract_ordered_extent(). Since extract_ordered_extent() uses
create_io_em() to split an existing extent_map, we will have
split->orig_start != split->start. Then, it will be logged with non-zero
"extent data offset". Finally, the logged entries are replayed into
a duplicated EXTENT_ITEM.
Introduce and use proper splitting function for extent_map. The function is
intended to be simple and specific usage for extract_ordered_extent() e.g.
not supporting compression case (we do not allow splitting compressed
extent_map anyway).
There was a question raised by Qu, in summary why we want to split the
extent map (and not the bio):
The problem is not the limit on the zone end, which as you mention is
the same as the block group end. The problem is that data write use zone
append (ZA) operations. ZA BIOs cannot be split so a large extent may
need to be processed with multiple ZA BIOs, While that is also true for
regular writes, the major difference is that ZA are "nameless" write
operation giving back the written sectors on completion. And ZA
operations may be reordered by the block layer (not intentionally
though). Combine both of these characteristics and you can see that the
data for a large extent may end up being shuffled when written resulting
in data corruption and the impossibility to map the extent to some start
sector.
To avoid this problem, zoned btrfs uses the principle "one data extent
== one ZA BIO". So large extents need to be split. This is unfortunate,
but we can revisit this later and optimize, e.g. merge back together the
fragments of an extent once written if they actually were written
sequentially in the zone.
Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
CC: stable@vger.kernel.org # 5.12+
CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-28 17:57:28 +09:00
|
|
|
|
|
|
|
|
flags = em->flags;
|
|
|
|
|
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
|
|
|
|
|
|
/* First, replace the em with a new extent_map starting from * em->start */
|
|
|
|
|
split_pre->start = em->start;
|
|
|
|
|
split_pre->len = (pre ? pre : em->len - post);
|
|
|
|
|
split_pre->orig_start = split_pre->start;
|
|
|
|
|
split_pre->block_start = em->block_start;
|
|
|
|
|
split_pre->block_len = split_pre->len;
|
|
|
|
|
split_pre->orig_block_len = split_pre->block_len;
|
|
|
|
|
split_pre->ram_bytes = split_pre->len;
|
|
|
|
|
split_pre->flags = flags;
|
|
|
|
|
split_pre->compress_type = em->compress_type;
|
|
|
|
|
split_pre->generation = em->generation;
|
|
|
|
|
|
2021-08-09 09:29:18 +09:00
|
|
|
replace_extent_mapping(em_tree, em, split_pre, 1);
|
btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
Damien reported a test failure with btrfs/209. The test itself ran fine,
but the fsck ran afterwards reported a corrupted filesystem.
The filesystem corruption happens because we're splitting an extent and
then writing the extent twice. We have to split the extent though, because
we're creating too large extents for a REQ_OP_ZONE_APPEND operation.
When dumping the extent tree, we can see two EXTENT_ITEMs at the same
start address but different lengths.
$ btrfs inspect dump-tree /dev/nullb1 -t extent
...
item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
The duplicated EXTENT_ITEMs originally come from wrongly split extent_map in
extract_ordered_extent(). Since extract_ordered_extent() uses
create_io_em() to split an existing extent_map, we will have
split->orig_start != split->start. Then, it will be logged with non-zero
"extent data offset". Finally, the logged entries are replayed into
a duplicated EXTENT_ITEM.
Introduce and use proper splitting function for extent_map. The function is
intended to be simple and specific usage for extract_ordered_extent() e.g.
not supporting compression case (we do not allow splitting compressed
extent_map anyway).
There was a question raised by Qu, in summary why we want to split the
extent map (and not the bio):
The problem is not the limit on the zone end, which as you mention is
the same as the block group end. The problem is that data write use zone
append (ZA) operations. ZA BIOs cannot be split so a large extent may
need to be processed with multiple ZA BIOs, While that is also true for
regular writes, the major difference is that ZA are "nameless" write
operation giving back the written sectors on completion. And ZA
operations may be reordered by the block layer (not intentionally
though). Combine both of these characteristics and you can see that the
data for a large extent may end up being shuffled when written resulting
in data corruption and the impossibility to map the extent to some start
sector.
To avoid this problem, zoned btrfs uses the principle "one data extent
== one ZA BIO". So large extents need to be split. This is unfortunate,
but we can revisit this later and optimize, e.g. merge back together the
fragments of an extent once written if they actually were written
sequentially in the zone.
Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
CC: stable@vger.kernel.org # 5.12+
CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-28 17:57:28 +09:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Now we only have an extent_map at:
|
|
|
|
|
* [em->start, em->start + pre] if pre != 0
|
|
|
|
|
* [em->start, em->start + em->len - post] if pre == 0
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
if (pre) {
|
|
|
|
|
/* Insert the middle extent_map */
|
|
|
|
|
split_mid->start = em->start + pre;
|
|
|
|
|
split_mid->len = em->len - pre - post;
|
|
|
|
|
split_mid->orig_start = split_mid->start;
|
|
|
|
|
split_mid->block_start = em->block_start + pre;
|
|
|
|
|
split_mid->block_len = split_mid->len;
|
|
|
|
|
split_mid->orig_block_len = split_mid->block_len;
|
|
|
|
|
split_mid->ram_bytes = split_mid->len;
|
|
|
|
|
split_mid->flags = flags;
|
|
|
|
|
split_mid->compress_type = em->compress_type;
|
|
|
|
|
split_mid->generation = em->generation;
|
2021-08-09 09:29:18 +09:00
|
|
|
add_extent_mapping(em_tree, split_mid, 1);
|
btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
Damien reported a test failure with btrfs/209. The test itself ran fine,
but the fsck ran afterwards reported a corrupted filesystem.
The filesystem corruption happens because we're splitting an extent and
then writing the extent twice. We have to split the extent though, because
we're creating too large extents for a REQ_OP_ZONE_APPEND operation.
When dumping the extent tree, we can see two EXTENT_ITEMs at the same
start address but different lengths.
$ btrfs inspect dump-tree /dev/nullb1 -t extent
...
item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
The duplicated EXTENT_ITEMs originally come from wrongly split extent_map in
extract_ordered_extent(). Since extract_ordered_extent() uses
create_io_em() to split an existing extent_map, we will have
split->orig_start != split->start. Then, it will be logged with non-zero
"extent data offset". Finally, the logged entries are replayed into
a duplicated EXTENT_ITEM.
Introduce and use proper splitting function for extent_map. The function is
intended to be simple and specific usage for extract_ordered_extent() e.g.
not supporting compression case (we do not allow splitting compressed
extent_map anyway).
There was a question raised by Qu, in summary why we want to split the
extent map (and not the bio):
The problem is not the limit on the zone end, which as you mention is
the same as the block group end. The problem is that data write use zone
append (ZA) operations. ZA BIOs cannot be split so a large extent may
need to be processed with multiple ZA BIOs, While that is also true for
regular writes, the major difference is that ZA are "nameless" write
operation giving back the written sectors on completion. And ZA
operations may be reordered by the block layer (not intentionally
though). Combine both of these characteristics and you can see that the
data for a large extent may end up being shuffled when written resulting
in data corruption and the impossibility to map the extent to some start
sector.
To avoid this problem, zoned btrfs uses the principle "one data extent
== one ZA BIO". So large extents need to be split. This is unfortunate,
but we can revisit this later and optimize, e.g. merge back together the
fragments of an extent once written if they actually were written
sequentially in the zone.
Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
CC: stable@vger.kernel.org # 5.12+
CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-28 17:57:28 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (post) {
|
|
|
|
|
split_post->start = em->start + em->len - post;
|
|
|
|
|
split_post->len = post;
|
|
|
|
|
split_post->orig_start = split_post->start;
|
|
|
|
|
split_post->block_start = em->block_start + em->len - post;
|
|
|
|
|
split_post->block_len = split_post->len;
|
|
|
|
|
split_post->orig_block_len = split_post->block_len;
|
|
|
|
|
split_post->ram_bytes = split_post->len;
|
|
|
|
|
split_post->flags = flags;
|
|
|
|
|
split_post->compress_type = em->compress_type;
|
|
|
|
|
split_post->generation = em->generation;
|
2021-08-09 09:29:18 +09:00
|
|
|
add_extent_mapping(em_tree, split_post, 1);
|
btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
Damien reported a test failure with btrfs/209. The test itself ran fine,
but the fsck ran afterwards reported a corrupted filesystem.
The filesystem corruption happens because we're splitting an extent and
then writing the extent twice. We have to split the extent though, because
we're creating too large extents for a REQ_OP_ZONE_APPEND operation.
When dumping the extent tree, we can see two EXTENT_ITEMs at the same
start address but different lengths.
$ btrfs inspect dump-tree /dev/nullb1 -t extent
...
item 19 key (269484032 EXTENT_ITEM 126976) itemoff 15470 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
item 20 key (269484032 EXTENT_ITEM 262144) itemoff 15417 itemsize 53
refs 1 gen 7 flags DATA
extent data backref root FS_TREE objectid 257 offset 786432 count 1
The duplicated EXTENT_ITEMs originally come from wrongly split extent_map in
extract_ordered_extent(). Since extract_ordered_extent() uses
create_io_em() to split an existing extent_map, we will have
split->orig_start != split->start. Then, it will be logged with non-zero
"extent data offset". Finally, the logged entries are replayed into
a duplicated EXTENT_ITEM.
Introduce and use proper splitting function for extent_map. The function is
intended to be simple and specific usage for extract_ordered_extent() e.g.
not supporting compression case (we do not allow splitting compressed
extent_map anyway).
There was a question raised by Qu, in summary why we want to split the
extent map (and not the bio):
The problem is not the limit on the zone end, which as you mention is
the same as the block group end. The problem is that data write use zone
append (ZA) operations. ZA BIOs cannot be split so a large extent may
need to be processed with multiple ZA BIOs, While that is also true for
regular writes, the major difference is that ZA are "nameless" write
operation giving back the written sectors on completion. And ZA
operations may be reordered by the block layer (not intentionally
though). Combine both of these characteristics and you can see that the
data for a large extent may end up being shuffled when written resulting
in data corruption and the impossibility to map the extent to some start
sector.
To avoid this problem, zoned btrfs uses the principle "one data extent
== one ZA BIO". So large extents need to be split. This is unfortunate,
but we can revisit this later and optimize, e.g. merge back together the
fragments of an extent once written if they actually were written
sequentially in the zone.
Reported-by: Damien Le Moal <damien.lemoal@wdc.com>
Fixes: d22002fd37bd ("btrfs: zoned: split ordered extent when bio is sent")
CC: stable@vger.kernel.org # 5.12+
CC: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-28 17:57:28 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Once for us */
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
/* Once for the tree */
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
|
unlock_extent(&inode->io_tree, start, start + len - 1);
|
|
|
|
|
out:
|
|
|
|
|
free_extent_map(split_pre);
|
|
|
|
|
free_extent_map(split_mid);
|
|
|
|
|
free_extent_map(split_post);
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-04 19:22:00 +09:00
|
|
|
/*
 * Look up the ordered extent at @file_offset and, unless @bio already covers
 * all of its disk bytes, work out how many bytes of the ordered extent lie
 * before (pre) and after (post) the bio's disk range. Those counts are passed
 * to btrfs_split_ordered_extent() and then to split_zoned_em().
 *
 * Returns BLK_STS_IOERR if no ordered extent is found; otherwise the errno
 * result (0, -EINVAL from a failed sanity check, or the error of a split
 * helper) converted with errno_to_blk_status().
 */
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
					   struct bio *bio, loff_t file_offset)
{
	const u64 bio_start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 bio_len = bio->bi_iter.bi_size;
	const u64 bio_end = bio_start + bio_len;
	struct btrfs_ordered_extent *ordered;
	u64 ordered_end;
	u64 file_len;
	u64 pre;
	u64 post;
	int ret = 0;

	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
	if (WARN_ON_ONCE(!ordered))
		return BLK_STS_IOERR;

	/* The bio spans the whole ordered extent: nothing to split. */
	if (bio_len == ordered->disk_num_bytes)
		goto out;

	/* Every sanity check below bails out with the same error. */
	ret = -EINVAL;

	/* An ordered extent that was already end_bio'd cannot be split. */
	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
		goto out;

	/* A compressed ordered extent cannot be split. */
	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
		goto out;

	/* The bio has to lie entirely within this one ordered extent. */
	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
	if (WARN_ON_ONCE(bio_start < ordered->disk_bytenr ||
			 bio_end > ordered_end))
		goto out;

	/* The checksum list is expected to still be empty. */
	if (WARN_ON_ONCE(!list_empty(&ordered->list)))
		goto out;

	/*
	 * Take these values before btrfs_split_ordered_extent() runs.
	 * NOTE(review): presumably that call changes the ordered extent's
	 * size fields - confirm against its implementation.
	 */
	file_len = ordered->num_bytes;
	pre = bio_start - ordered->disk_bytenr;
	post = ordered_end - bio_end;

	ret = btrfs_split_ordered_extent(ordered, pre, post);
	if (!ret)
		ret = split_zoned_em(inode, file_offset, file_len, pre, post);

out:
	btrfs_put_ordered_extent(ordered);

	return errno_to_blk_status(ret);
}
|
|
|
|
|
|
2022-05-26 09:36:35 +02:00
|
|
|
void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
|
2008-04-16 11:14:51 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2022-05-26 09:36:35 +02:00
|
|
|
struct btrfs_inode *bi = BTRFS_I(inode);
|
|
|
|
|
blk_status_t ret;
|
2011-10-03 23:23:12 -04:00
|
|
|
|
2021-02-04 19:22:00 +09:00
|
|
|
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
|
2022-05-26 09:36:35 +02:00
|
|
|
ret = extract_ordered_extent(bi, bio,
|
|
|
|
|
page_offset(bio_first_bvec_all(bio)->bv_page));
|
2022-08-06 10:03:26 +02:00
|
|
|
if (ret) {
|
|
|
|
|
btrfs_bio_end_io(btrfs_bio(bio), ret);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2021-02-04 19:22:00 +09:00
|
|
|
}
|
|
|
|
|
|
2022-05-26 09:36:35 +02:00
|
|
|
/*
|
2022-06-17 12:04:11 +02:00
|
|
|
* If we need to checksum, and the I/O is not issued by fsync and
|
|
|
|
|
* friends, that is ->sync_writers != 0, defer the submission to a
|
|
|
|
|
* workqueue to parallelize it.
|
|
|
|
|
*
|
|
|
|
|
* Csum items for reloc roots have already been cloned at this point,
|
|
|
|
|
* so they are handled as part of the no-checksum case.
|
2022-05-26 09:36:35 +02:00
|
|
|
*/
|
|
|
|
|
if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
|
2022-06-17 12:04:11 +02:00
|
|
|
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
|
|
|
|
|
!btrfs_is_data_reloc_root(bi->root)) {
|
2022-06-17 12:04:12 +02:00
|
|
|
if (!atomic_read(&bi->sync_writers) &&
|
|
|
|
|
btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
|
|
|
|
|
btrfs_submit_bio_start))
|
2022-04-15 16:33:28 +02:00
|
|
|
return;
|
2022-06-17 12:04:11 +02:00
|
|
|
|
|
|
|
|
ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
|
2022-08-06 10:03:26 +02:00
|
|
|
if (ret) {
|
|
|
|
|
btrfs_bio_end_io(btrfs_bio(bio), ret);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2008-10-30 14:23:13 -04:00
|
|
|
}
|
2022-06-17 12:04:07 +02:00
|
|
|
btrfs_submit_bio(fs_info, bio, mirror_num);
|
2022-05-26 09:36:35 +02:00
|
|
|
}
|
2008-10-30 14:23:13 -04:00
|
|
|
|
2022-05-26 09:36:35 +02:00
|
|
|
void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
|
|
|
|
|
int mirror_num, enum btrfs_compression_type compress_type)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
blk_status_t ret;
|
2012-11-05 18:51:52 +01:00
|
|
|
|
2022-05-26 09:36:35 +02:00
|
|
|
if (compress_type != BTRFS_COMPRESS_NONE) {
|
|
|
|
|
/*
|
|
|
|
|
* btrfs_submit_compressed_read will handle completing the bio
|
|
|
|
|
* if there were any errors, so just return here.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_submit_compressed_read(inode, bio, mirror_num);
|
|
|
|
|
return;
|
2008-10-30 14:23:13 -04:00
|
|
|
}
|
|
|
|
|
|
2022-07-07 07:33:30 +02:00
|
|
|
/* Save the original iter for read repair */
|
|
|
|
|
btrfs_bio(bio)->iter = bio->bi_iter;
|
|
|
|
|
|
2022-05-26 09:36:35 +02:00
|
|
|
/*
|
|
|
|
|
* Lookup bio sums does extra checks around whether we need to csum or
|
|
|
|
|
* not, which is why we ignore skip_sum here.
|
|
|
|
|
*/
|
|
|
|
|
ret = btrfs_lookup_bio_sums(inode, bio, NULL);
|
2017-06-03 09:38:06 +02:00
|
|
|
if (ret) {
|
2022-08-06 10:03:26 +02:00
|
|
|
btrfs_bio_end_io(btrfs_bio(bio), ret);
|
2022-06-17 12:04:07 +02:00
|
|
|
return;
|
2015-07-20 15:29:37 +02:00
|
|
|
}
|
2022-06-17 12:04:07 +02:00
|
|
|
|
|
|
|
|
btrfs_submit_bio(fs_info, bio, mirror_num);
|
2008-02-20 12:07:25 -05:00
|
|
|
}
|
2008-02-20 16:11:05 -05:00
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* given a list of ordered sums record them in the inode. This happens
|
|
|
|
|
* at IO completion time based on sums calculated at bio submission time.
|
|
|
|
|
*/
|
2020-09-18 12:15:52 +03:00
|
|
|
static int add_pending_csums(struct btrfs_trans_handle *trans,
|
|
|
|
|
struct list_head *list)
|
2008-07-17 12:53:50 -04:00
|
|
|
{
|
|
|
|
|
struct btrfs_ordered_sum *sum;
|
2021-11-05 16:45:48 -04:00
|
|
|
struct btrfs_root *csum_root = NULL;
|
2018-01-08 10:59:43 +02:00
|
|
|
int ret;
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2009-01-21 10:59:08 -05:00
|
|
|
list_for_each_entry(sum, list, list) {
|
2017-11-08 01:07:43 +01:00
|
|
|
trans->adding_csums = true;
|
2021-11-05 16:45:48 -04:00
|
|
|
if (!csum_root)
|
|
|
|
|
csum_root = btrfs_csum_root(trans->fs_info,
|
|
|
|
|
sum->bytenr);
|
|
|
|
|
ret = btrfs_csum_file_blocks(trans, csum_root, sum);
|
2017-11-08 01:07:43 +01:00
|
|
|
trans->adding_csums = false;
|
2018-01-08 10:59:43 +02:00
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
2008-07-17 12:53:50 -04:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: fix missing delalloc new bit for new delalloc ranges
When doing a buffered write, through one of the write family syscalls, we
look for ranges which currently don't have allocated extents and set the
'delalloc new' bit on them, so that we can report a correct number of used
blocks to the stat(2) syscall until delalloc is flushed and ordered extents
complete.
However there are a few other places where we can do a buffered write
against a range that is mapped to a hole (no extent allocated) and where
we do not set the 'new delalloc' bit. Those places are:
- Doing a memory mapped write against a hole;
- Cloning an inline extent into a hole starting at file offset 0;
- Calling btrfs_cont_expand() when the i_size of the file is not aligned
to the sector size and is located in a hole. For example when cloning
to a destination offset beyond EOF.
So after such cases, until the corresponding delalloc range is flushed and
the respective ordered extents complete, we can report an incorrect number
of blocks used through the stat(2) syscall.
In some cases we can end up reporting 0 used blocks to stat(2), which is a
particularly bad value to report as it may mislead tools to think a file is
completely sparse when its i_size is not zero, making them skip reading
any data, an undesired consequence for tools such as archivers and other
backup tools, as reported a long time ago in the following thread (and
other past threads):
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
Example reproducer:
$ cat reproducer.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -c "truncate 64K" \
-c "mmap -w 0 64K" \
-c "mwrite -S 0xab 0 64K" \
-c "munmap" \
$MNT/foo
blocks_used=$(stat -c %b $MNT/foo)
echo "blocks used: $blocks_used"
if [ $blocks_used -eq 0 ]; then
echo "ERROR: blocks used is 0"
fi
umount $DEV
$ ./reproducer.sh
blocks used: 0
ERROR: blocks used is 0
So move the logic that decides to set the 'delalloc new' bit into the
function btrfs_set_extent_delalloc(), since that is what we use for all
those missing cases as well as for the cases that currently work well.
This change is also preparatory work for an upcoming patch that fixes
other problems related to tracking and reporting the number of bytes used
by an inode.
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:31 +00:00
|
|
|
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
|
|
|
|
|
const u64 start,
|
|
|
|
|
const u64 len,
|
|
|
|
|
struct extent_state **cached_state)
|
|
|
|
|
{
|
|
|
|
|
u64 search_start = start;
|
|
|
|
|
const u64 end = start + len - 1;
|
|
|
|
|
|
|
|
|
|
while (search_start < end) {
|
|
|
|
|
const u64 search_len = end - search_start + 1;
|
|
|
|
|
struct extent_map *em;
|
|
|
|
|
u64 em_len;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
|
|
|
|
|
if (IS_ERR(em))
|
|
|
|
|
return PTR_ERR(em);
|
|
|
|
|
|
|
|
|
|
if (em->block_start != EXTENT_MAP_HOLE)
|
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
|
|
em_len = em->len;
|
|
|
|
|
if (em->start < search_start)
|
|
|
|
|
em_len -= search_start - em->start;
|
|
|
|
|
if (em_len > search_len)
|
|
|
|
|
em_len = search_len;
|
|
|
|
|
|
|
|
|
|
ret = set_extent_bit(&inode->io_tree, search_start,
|
|
|
|
|
search_start + em_len - 1,
|
2020-11-05 11:08:00 +02:00
|
|
|
EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
|
|
|
|
|
GFP_NOFS, NULL);
|
btrfs: fix missing delalloc new bit for new delalloc ranges
When doing a buffered write, through one of the write family syscalls, we
look for ranges which currently don't have allocated extents and set the
'delalloc new' bit on them, so that we can report a correct number of used
blocks to the stat(2) syscall until delalloc is flushed and ordered extents
complete.
However there are a few other places where we can do a buffered write
against a range that is mapped to a hole (no extent allocated) and where
we do not set the 'new delalloc' bit. Those places are:
- Doing a memory mapped write against a hole;
- Cloning an inline extent into a hole starting at file offset 0;
- Calling btrfs_cont_expand() when the i_size of the file is not aligned
to the sector size and is located in a hole. For example when cloning
to a destination offset beyond EOF.
So after such cases, until the corresponding delalloc range is flushed and
the respective ordered extents complete, we can report an incorrect number
of blocks used through the stat(2) syscall.
In some cases we can end up reporting 0 used blocks to stat(2), which is a
particular bad value to report as it may mislead tools to think a file is
completely sparse when its i_size is not zero, making them skip reading
any data, an undesired consequence for tools such as archivers and other
backup tools, as reported a long time ago in the following thread (and
other past threads):
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
Example reproducer:
$ cat reproducer.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -c "truncate 64K" \
-c "mmap -w 0 64K" \
-c "mwrite -S 0xab 0 64K" \
-c "munmap" \
$MNT/foo
blocks_used=$(stat -c %b $MNT/foo)
echo "blocks used: $blocks_used"
if [ $blocks_used -eq 0 ]; then
echo "ERROR: blocks used is 0"
fi
umount $DEV
$ ./reproducer.sh
blocks used: 0
ERROR: blocks used is 0
So move the logic that decides to set the 'delalloc bit' bit into the
function btrfs_set_extent_delalloc(), since that is what we use for all
those missing cases as well as for the cases that currently work well.
This change is also preparatory work for an upcoming patch that fixes
other problems related to tracking and reporting the number of bytes used
by an inode.
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:31 +00:00
|
|
|
next:
|
|
|
|
|
search_start = extent_map_end(em);
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:35 +03:00
|
|
|
/*
 * Mark the inclusive byte range [start, end] as delalloc in the inode's io
 * tree, deciding along the way which parts also get EXTENT_DELALLOC_NEW.
 *
 * @inode:        inode owning the range
 * @start:        first byte of the range
 * @end:          last byte of the range (inclusive)
 * @extra_bits:   additional extent bits handed to set_extent_delalloc()
 * @cached_state: passed through to the io tree helpers
 *
 * Returns the result of set_extent_delalloc(), or the error from
 * btrfs_find_new_delalloc_bytes() if the hole lookup fails.
 */
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
			      unsigned int extra_bits,
			      struct extent_state **cached_state)
{
	/* @end is inclusive, so a page aligned value is not expected here. */
	WARN_ON(PAGE_ALIGNED(end));

	if (start < i_size_read(&inode->vfs_inode) ||
	    (inode->flags & BTRFS_INODE_PREALLOC)) {
		int ret;

		/*
		 * The range may overlap existing extents, so look at the
		 * extent maps and flag only the holes as new delalloc.
		 */
		ret = btrfs_find_new_delalloc_bytes(inode, start,
						    end + 1 - start,
						    cached_state);
		if (ret)
			return ret;
	} else {
		/*
		 * The range starts at or beyond i_size and the inode has no
		 * prealloc flag, so no extents can follow eof: the whole
		 * range is new delalloc and the bit can be set directly.
		 */
		extra_bits |= EXTENT_DELALLOC_NEW;
	}

	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
				   cached_state);
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/* see the comment above btrfs_writepage_cow_fixup() for details on why this is required */
|
2008-07-17 12:53:51 -04:00
|
|
|
/* Work item describing one page handed to btrfs_writepage_fixup_worker(). */
struct btrfs_writepage_fixup {
	/* Page to fix up; the worker locks it and calls put_page() on it when done. */
	struct page *page;
	/* VFS inode of the page; the worker releases it with btrfs_add_delayed_iput(). */
	struct inode *inode;
	/* Embedded work; the worker gets back to this struct via container_of(). */
	struct btrfs_work work;
};
|
|
|
|
|
|
2008-12-02 09:54:17 -05:00
|
|
|
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
|
2008-07-17 12:53:51 -04:00
|
|
|
{
|
|
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2017-02-27 15:10:38 +08:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2008-07-17 12:53:51 -04:00
|
|
|
struct page *page;
|
2020-06-05 10:51:51 +03:00
|
|
|
struct btrfs_inode *inode;
|
2008-07-17 12:53:51 -04:00
|
|
|
u64 page_start;
|
|
|
|
|
u64 page_end;
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
int ret = 0;
|
2020-01-21 14:34:52 -05:00
|
|
|
bool free_delalloc_space = true;
|
2008-07-17 12:53:51 -04:00
|
|
|
|
|
|
|
|
fixup = container_of(work, struct btrfs_writepage_fixup, work);
|
|
|
|
|
page = fixup->page;
|
2020-06-05 10:51:51 +03:00
|
|
|
inode = BTRFS_I(fixup->inode);
|
2020-01-21 14:34:52 -05:00
|
|
|
page_start = page_offset(page);
|
|
|
|
|
page_end = page_offset(page) + PAGE_SIZE - 1;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* This is similar to page_mkwrite, we need to reserve the space before
|
|
|
|
|
* we take the page lock.
|
|
|
|
|
*/
|
2020-06-05 10:51:51 +03:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
|
|
|
|
|
PAGE_SIZE);
|
2008-07-21 10:29:44 -04:00
|
|
|
again:
|
2008-07-17 12:53:51 -04:00
|
|
|
lock_page(page);
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Before we queued this fixup, we took a reference on the page.
|
|
|
|
|
* page->mapping may go NULL, but it shouldn't be moved to a different
|
|
|
|
|
* address space.
|
|
|
|
|
*/
|
2020-01-21 14:34:52 -05:00
|
|
|
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
|
|
|
|
|
/*
|
|
|
|
|
* Unfortunately this is a little tricky, either
|
|
|
|
|
*
|
|
|
|
|
* 1) We got here and our page had already been dealt with and
|
|
|
|
|
* we reserved our space, thus ret == 0, so we need to just
|
|
|
|
|
* drop our space reservation and bail. This can happen the
|
|
|
|
|
* first time we come into the fixup worker, or could happen
|
|
|
|
|
* while waiting for the ordered extent.
|
|
|
|
|
* 2) Our page was already dealt with, but we happened to get an
|
|
|
|
|
* ENOSPC above from the btrfs_delalloc_reserve_space. In
|
|
|
|
|
* this case we obviously don't have anything to release, but
|
|
|
|
|
* because the page was already dealt with we don't want to
|
|
|
|
|
* mark the page with an error, so make sure we're resetting
|
|
|
|
|
* ret to 0. This is why we have this check _before_ the ret
|
|
|
|
|
* check, because we do not want to have a surprise ENOSPC
|
|
|
|
|
* when the page was already properly dealt with.
|
|
|
|
|
*/
|
|
|
|
|
if (!ret) {
|
2020-06-05 10:51:51 +03:00
|
|
|
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
|
|
|
|
|
btrfs_delalloc_release_space(inode, data_reserved,
|
2020-01-21 14:34:52 -05:00
|
|
|
page_start, PAGE_SIZE,
|
|
|
|
|
true);
|
|
|
|
|
}
|
|
|
|
|
ret = 0;
|
2008-07-17 12:53:51 -04:00
|
|
|
goto out_page;
|
2020-01-21 14:34:52 -05:00
|
|
|
}
|
2008-07-17 12:53:51 -04:00
|
|
|
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
/*
|
2020-01-21 14:34:52 -05:00
|
|
|
* We can't mess with the page state unless it is locked, so now that
|
|
|
|
|
* it is locked bail if we failed to make our space reservation.
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
*/
|
2020-01-21 14:34:52 -05:00
|
|
|
if (ret)
|
|
|
|
|
goto out_page;
|
2008-07-17 12:53:51 -04:00
|
|
|
|
2020-06-05 10:51:51 +03:00
|
|
|
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
|
2008-07-21 10:29:44 -04:00
|
|
|
|
|
|
|
|
/* already ordered? We're done */
|
2021-04-07 19:22:13 +08:00
|
|
|
if (PageOrdered(page))
|
2020-01-21 14:34:52 -05:00
|
|
|
goto out_reserved;
|
2008-07-21 10:29:44 -04:00
|
|
|
|
2020-06-05 10:51:51 +03:00
|
|
|
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
|
2008-07-21 10:29:44 -04:00
|
|
|
if (ordered) {
|
2020-06-05 10:51:51 +03:00
|
|
|
unlock_extent_cached(&inode->io_tree, page_start, page_end,
|
|
|
|
|
&cached_state);
|
2008-07-21 10:29:44 -04:00
|
|
|
unlock_page(page);
|
2020-09-18 12:15:53 +03:00
|
|
|
btrfs_start_ordered_extent(ordered, 1);
|
2012-02-15 16:23:57 +01:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
2008-07-21 10:29:44 -04:00
|
|
|
goto again;
|
|
|
|
|
}
|
2008-07-17 12:53:51 -04:00
|
|
|
|
2020-06-05 10:51:51 +03:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
|
2019-07-17 16:18:17 +03:00
|
|
|
&cached_state);
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
if (ret)
|
2019-10-09 17:43:59 +01:00
|
|
|
goto out_reserved;
|
2017-12-05 09:29:19 +02:00
|
|
|
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
/*
|
|
|
|
|
* Everything went as planned, we're now the owner of a dirty page with
|
|
|
|
|
* delayed allocation bits set and space reserved for our COW
|
|
|
|
|
* destination.
|
|
|
|
|
*
|
|
|
|
|
* The page was dirty when we started, nothing should have cleaned it.
|
|
|
|
|
*/
|
|
|
|
|
BUG_ON(!PageDirty(page));
|
2020-01-21 14:34:52 -05:00
|
|
|
free_delalloc_space = false;
|
2019-10-09 17:43:59 +01:00
|
|
|
out_reserved:
|
2020-06-05 10:51:51 +03:00
|
|
|
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
|
2020-01-21 14:34:52 -05:00
|
|
|
if (free_delalloc_space)
|
2020-06-05 10:51:51 +03:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved, page_start,
|
|
|
|
|
PAGE_SIZE, true);
|
|
|
|
|
unlock_extent_cached(&inode->io_tree, page_start, page_end,
|
2017-12-12 21:43:52 +01:00
|
|
|
&cached_state);
|
2008-07-17 12:53:51 -04:00
|
|
|
out_page:
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects pages dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
if (ret) {
|
|
|
|
|
/*
|
|
|
|
|
* We hit ENOSPC or other errors. Update the mapping and page
|
|
|
|
|
* to reflect the errors and clean the page.
|
|
|
|
|
*/
|
|
|
|
|
mapping_set_error(page->mapping, ret);
|
|
|
|
|
end_extent_writepage(page, ret, page_start, page_end);
|
|
|
|
|
clear_page_dirty_for_io(page);
|
|
|
|
|
SetPageError(page);
|
|
|
|
|
}
|
2021-09-27 15:21:49 +08:00
|
|
|
btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
|
2008-07-17 12:53:51 -04:00
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
2011-01-26 16:19:22 +08:00
|
|
|
kfree(fixup);
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_free(data_reserved);
|
2020-01-21 14:34:52 -05:00
|
|
|
/*
|
|
|
|
|
* As a precaution, do a delayed iput in case it would be the last iput
|
|
|
|
|
* that could need flushing space. Recursing back to fixup worker would
|
|
|
|
|
* deadlock.
|
|
|
|
|
*/
|
2020-06-05 10:51:51 +03:00
|
|
|
btrfs_add_delayed_iput(&inode->vfs_inode);
|
2008-07-17 12:53:51 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* There are a few paths in the higher layers of the kernel that directly
|
|
|
|
|
* set the page dirty bit without asking the filesystem if it is a
|
|
|
|
|
* good idea. This causes problems because we want to make sure COW
|
|
|
|
|
* properly happens and the data=ordered rules are followed.
|
|
|
|
|
*
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
* In our case any range that doesn't have the ORDERED bit set
|
2008-07-17 12:53:51 -04:00
|
|
|
* hasn't been properly setup for IO. We kick off an async process
|
|
|
|
|
* to fix it up. The async helper will wait for ordered extents, set
|
|
|
|
|
* the delalloc bit and make it safe to write the page.
|
|
|
|
|
*/
|
2021-07-27 13:41:32 +08:00
|
|
|
int btrfs_writepage_cow_fixup(struct page *page)
|
2008-07-17 12:53:51 -04:00
|
|
|
{
|
|
|
|
|
struct inode *inode = page->mapping->host;
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-07-17 12:53:51 -04:00
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
|
|
2021-04-07 19:22:13 +08:00
|
|
|
/* This page has ordered extent covering it already */
|
|
|
|
|
if (PageOrdered(page))
|
2008-07-17 12:53:51 -04:00
|
|
|
return 0;
|
|
|
|
|
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
/*
|
|
|
|
|
* PageChecked is set below when we create a fixup worker for this page,
|
|
|
|
|
* don't try to create another one if we're already PageChecked()
|
|
|
|
|
*
|
|
|
|
|
* The extent_io writepage code will redirty the page if we send back
|
|
|
|
|
* EAGAIN.
|
|
|
|
|
*/
|
2008-07-17 12:53:51 -04:00
|
|
|
if (PageChecked(page))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
|
|
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
|
|
|
|
|
if (!fixup)
|
|
|
|
|
return -EAGAIN;
|
2008-07-22 11:18:09 -04:00
|
|
|
|
2020-01-21 14:34:52 -05:00
|
|
|
/*
|
|
|
|
|
* We are already holding a reference to this inode from
|
|
|
|
|
* write_cache_pages. We need to hold it because the space reservation
|
|
|
|
|
* takes place outside of the page lock, and we can't trust
|
|
|
|
|
* page->mapping outside of the page lock.
|
|
|
|
|
*/
|
|
|
|
|
ihold(inode);
|
2021-09-27 15:21:49 +08:00
|
|
|
btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion whether a
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
get_page(page);
|
2019-09-16 11:30:57 -07:00
|
|
|
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
|
2008-07-17 12:53:51 -04:00
|
|
|
fixup->page = page;
|
2020-01-21 14:34:52 -05:00
|
|
|
fixup->inode = inode;
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
|
Btrfs: keep pages dirty when using btrfs_writepage_fixup_worker
For COW, btrfs expects dirty pages to have been through a few setup
steps. This includes reserving space for the new block allocations and marking
the range in the state tree for delayed allocation.
A few places outside btrfs will dirty pages directly, especially when unmapping
mmap'd pages. In order for these to properly go through COW, we run them
through a fixup worker to wait for stable pages, and do the delalloc prep.
87826df0ec36 added a window where the dirty pages were cleaned, but pending
more action from the fixup worker. We clear_page_dirty_for_io() before
we call into writepage, so the page is no longer dirty. The commit
changed it so now we leave the page clean between unlocking it here and
the fixup worker starting at some point in the future.
During this window, page migration can jump in and relocate the page. Once our
fixup work actually starts, it finds page->mapping is NULL and we end up
freeing the page without ever writing it.
This leads to crc errors and other exciting problems, since it screws up the
whole statemachine for waiting for ordered extents. The fix here is to keep
the page dirty while we're waiting for the fixup worker to get to work.
This is accomplished by returning -EAGAIN from btrfs_writepage_cow_fixup
if we queued the page up for fixup, which will cause the writepage
function to redirty the page.
Because we now expect the page to be dirty once it gets to the fixup
worker we must adjust the error cases to call clear_page_dirty_for_io()
on the page. That is the bulk of the patch, but it is not the fix, the
fix is the -EAGAIN from btrfs_writepage_cow_fixup. We cannot separate
these two changes out because the error conditions change with the new
expectations.
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-21 11:51:42 -05:00
|
|
|
|
|
|
|
|
return -EAGAIN;
|
2008-07-17 12:53:51 -04:00
|
|
|
}
|
|
|
|
|
|
2008-10-30 14:25:28 -04:00
|
|
|
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
|
2020-06-03 08:55:19 +03:00
|
|
|
struct btrfs_inode *inode, u64 file_pos,
|
2020-06-10 09:04:41 +08:00
|
|
|
struct btrfs_file_extent_item *stack_fi,
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
const bool update_inode_bytes,
|
2020-06-10 09:04:41 +08:00
|
|
|
u64 qgroup_reserved)
|
2008-10-30 14:25:28 -04:00
|
|
|
{
|
2020-06-03 08:55:19 +03:00
|
|
|
struct btrfs_root *root = inode->root;
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
const u64 sectorsize = root->fs_info->sectorsize;
|
2008-10-30 14:25:28 -04:00
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct btrfs_key ins;
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessary long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrapper for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
|
|
|
|
|
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
|
2019-11-06 12:11:56 -08:00
|
|
|
u64 offset = btrfs_stack_file_extent_offset(stack_fi);
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessary long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrapper for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
|
|
|
|
|
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
|
2020-11-04 11:07:32 +00:00
|
|
|
struct btrfs_drop_extents_args drop_args = { 0 };
|
2008-10-30 14:25:28 -04:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 10:38:47 -07:00
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
2008-10-30 14:25:28 -04:00
|
|
|
|
2009-09-11 12:27:37 -04:00
|
|
|
/*
|
|
|
|
|
* we may be replacing one extent in the tree with another.
|
|
|
|
|
* The new extent is pinned in the extent map, and we don't want
|
|
|
|
|
* to drop it from the cache until it is completely in the btree.
|
|
|
|
|
*
|
|
|
|
|
* So, tell btrfs_drop_extents to leave this extent in the cache.
|
|
|
|
|
* the caller is expected to unpin it and allow it to be merged
|
|
|
|
|
* with the others.
|
|
|
|
|
*/
|
2020-11-04 11:07:32 +00:00
|
|
|
drop_args.path = path;
|
|
|
|
|
drop_args.start = file_pos;
|
|
|
|
|
drop_args.end = file_pos + num_bytes;
|
|
|
|
|
drop_args.replace_extent = true;
|
|
|
|
|
drop_args.extent_item_size = sizeof(*stack_fi);
|
|
|
|
|
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
2008-10-30 14:25:28 -04:00
|
|
|
|
2020-11-04 11:07:32 +00:00
|
|
|
if (!drop_args.extent_inserted) {
|
2020-06-03 08:55:19 +03:00
|
|
|
ins.objectid = btrfs_ino(inode);
|
2014-01-07 11:42:27 +00:00
|
|
|
ins.offset = file_pos;
|
|
|
|
|
ins.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &ins,
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
sizeof(*stack_fi));
|
2014-01-07 11:42:27 +00:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2008-10-30 14:25:28 -04:00
|
|
|
leaf = path->nodes[0];
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
|
|
|
|
|
write_extent_buffer(leaf, stack_fi,
|
|
|
|
|
btrfs_item_ptr_offset(leaf, path->slots[0]),
|
|
|
|
|
sizeof(struct btrfs_file_extent_item));
|
2009-03-13 11:00:37 -04:00
|
|
|
|
2008-10-30 14:25:28 -04:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2012-09-25 15:26:16 -04:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 14:25:28 -04:00
|
|
|
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
/*
|
|
|
|
|
* If we dropped an inline extent here, we know the range it covered
|
|
|
|
|
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
|
2021-05-21 17:42:23 +02:00
|
|
|
* number of bytes only for that range containing the inline extent.
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
* The remainder of the range will be processed when clearing the
|
|
|
|
|
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
|
|
|
|
|
*/
|
|
|
|
|
if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
|
|
|
|
|
u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
|
|
|
|
|
|
|
|
|
|
inline_size = drop_args.bytes_found - inline_size;
|
|
|
|
|
btrfs_update_inode_bytes(inode, sectorsize, inline_size);
|
|
|
|
|
drop_args.bytes_found -= inline_size;
|
|
|
|
|
num_bytes -= sectorsize;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (update_inode_bytes)
|
|
|
|
|
btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
|
2008-10-30 14:25:28 -04:00
|
|
|
|
|
|
|
|
ins.objectid = disk_bytenr;
|
|
|
|
|
ins.offset = disk_num_bytes;
|
|
|
|
|
ins.type = BTRFS_EXTENT_ITEM_KEY;
|
btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quotas being enabled
[BUG]
Under the following case, we can underflow qgroup reserved space.
Task A | Task B
---------------------------------------------------------------
Quota disabled |
Buffered write |
|- btrfs_check_data_free_space() |
| *NO* qgroup space is reserved |
| since quota is *DISABLED* |
|- All pages are copied to page |
cache |
| Enable quota
| Quota scan finished
|
| Sync_fs
| |- run_delalloc_range
| |- Write pages
| |- btrfs_finish_ordered_io
| |- insert_reserved_file_extent
| |- btrfs_qgroup_release_data()
| Since no qgroup space is
reserved in Task A, we
underflow qgroup reserved
space
This can be detected by fstest btrfs/104.
[CAUSE]
In insert_reserved_file_extent() we tell qgroup to release the @ram_bytes
size of qgroup reserved_space in all cases.
And btrfs_qgroup_release_data() will check if quotas are enabled.
However in the above case, the buffered write happens before quota is
enabled, so we don't have the reserved space for that range.
[FIX]
In insert_reserved_file_extent(), we tell qgroup to release the actual
byte number it released.
In the above case, since we don't have the reserved space, we tell
qgroups to release 0 byte, so the problem can be fixed.
And thanks to the @reserved parameter introduced by the qgroup rework,
and previous patch to return released bytes, the fix can be as small as
10 lines.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
[ changelog updates ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:37 +08:00
|
|
|
|
2020-06-03 08:55:19 +03:00
|
|
|
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
|
2020-01-17 09:02:22 -05:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
|
2020-06-03 08:55:19 +03:00
|
|
|
ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
|
2019-11-06 12:11:56 -08:00
|
|
|
file_pos - offset,
|
|
|
|
|
qgroup_reserved, &ins);
|
2012-03-12 16:03:00 +01:00
|
|
|
out:
|
2008-10-30 14:25:28 -04:00
|
|
|
btrfs_free_path(path);
|
2009-03-13 11:00:37 -04:00
|
|
|
|
2012-03-12 16:03:00 +01:00
|
|
|
return ret;
|
2008-10-30 14:25:28 -04:00
|
|
|
}
|
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
/*
 * Subtract @len from the delalloc_bytes counter of the block group that
 * btrfs_lookup_block_group() returns for @start.
 *
 * @fs_info: filesystem info passed to the block group lookup
 * @start:   address used to look up the block group
 * @len:     number of bytes to subtract from the counter
 *
 * The counter is modified while holding the block group's spinlock, and the
 * block group obtained from the lookup is handed to btrfs_put_block_group()
 * before returning.  @len is not validated against the current counter value
 * here.
 */
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
u64 start, u64 len)
|
|
|
|
|
{
|
2019-10-29 19:20:18 +01:00
|
|
|
struct btrfs_block_group *cache;
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, start);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
/* The lookup result is dereferenced unconditionally below. */
ASSERT(cache);
|
|
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
|
cache->delalloc_bytes -= len;
|
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
|
struct btrfs_ordered_extent *oe)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_file_extent_item stack_fi;
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
bool update_inode_bytes;
|
2019-11-06 12:11:56 -08:00
|
|
|
u64 num_bytes = oe->num_bytes;
|
|
|
|
|
u64 ram_bytes = oe->ram_bytes;
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
|
|
|
|
|
memset(&stack_fi, 0, sizeof(stack_fi));
|
|
|
|
|
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
|
|
|
|
|
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
|
|
|
|
|
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
|
|
|
|
|
oe->disk_num_bytes);
|
2019-11-06 12:11:56 -08:00
|
|
|
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
|
2022-06-21 18:40:48 +02:00
|
|
|
if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
|
|
|
|
|
num_bytes = oe->truncated_len;
|
|
|
|
|
ram_bytes = num_bytes;
|
|
|
|
|
}
|
2019-11-06 12:11:56 -08:00
|
|
|
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
|
|
|
|
|
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
|
|
|
|
|
/* Encryption and other encoding is reserved and all 0 */
|
|
|
|
|
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
/*
|
|
|
|
|
* For delalloc, when completing an ordered extent we update the inode's
|
|
|
|
|
* bytes when clearing the range in the inode's io tree, so pass false
|
|
|
|
|
* as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
|
|
|
|
|
* except if the ordered extent was truncated.
|
|
|
|
|
*/
|
|
|
|
|
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
|
2019-08-13 16:00:02 -07:00
|
|
|
test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
|
|
|
|
|
|
2020-09-18 12:15:51 +03:00
|
|
|
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
|
|
|
|
|
oe->file_offset, &stack_fi,
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
update_inode_bytes, oe->qgroup_rsv);
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, there are only two callers: one in btrfs_finish_ordered_io(),
which inserts a file extent for an ordered extent, and one in
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* As ordered data IO finishes, this gets called so we can finish
|
2008-09-29 15:18:18 -04:00
|
|
|
* an ordered extent if the range of bytes in the file it covers are
|
|
|
|
|
* fully written.
|
|
|
|
|
*/
|
2022-06-19 08:07:05 +02:00
|
|
|
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
|
2008-07-17 12:53:50 -04:00
|
|
|
{
|
2020-11-02 16:48:56 +02:00
|
|
|
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2010-05-16 10:48:47 -04:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2020-11-02 16:48:56 +02:00
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2019-12-02 17:34:19 -08:00
|
|
|
u64 start, end;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type = 0;
|
2013-08-29 13:57:21 -04:00
|
|
|
int ret = 0;
|
2019-12-02 17:34:19 -08:00
|
|
|
u64 logical_len = ordered_extent->num_bytes;
|
2019-10-08 20:43:06 +03:00
|
|
|
bool freespace_inode;
|
2013-08-29 13:57:21 -04:00
|
|
|
bool truncated = false;
|
2018-10-11 15:54:21 -04:00
|
|
|
bool clear_reserved_extent = true;
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
unsigned int clear_bits = EXTENT_DEFRAG;
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
|
2019-12-02 17:34:19 -08:00
|
|
|
start = ordered_extent->file_offset;
|
|
|
|
|
end = start + ordered_extent->num_bytes - 1;
|
|
|
|
|
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
|
|
|
|
|
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
|
2019-08-13 16:00:02 -07:00
|
|
|
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
|
|
|
|
|
!test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
clear_bits |= EXTENT_DELALLOC_NEW;
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2020-11-02 16:48:56 +02:00
|
|
|
freespace_inode = btrfs_is_free_space_inode(inode);
|
2022-07-25 15:11:59 -07:00
|
|
|
if (!freespace_inode)
|
|
|
|
|
btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
|
2010-07-02 12:14:14 -04:00
|
|
|
|
2012-05-02 14:00:54 -04:00
|
|
|
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
|
|
|
|
|
ret = -EIO;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-19 21:19:23 +09:00
|
|
|
/* A valid bdev implies a write on a sequential zone */
|
|
|
|
|
if (ordered_extent->bdev) {
|
2021-02-04 19:22:05 +09:00
|
|
|
btrfs_rewrite_logical_zoned(ordered_extent);
|
2021-08-19 21:19:23 +09:00
|
|
|
btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
|
|
|
|
|
ordered_extent->disk_num_bytes);
|
|
|
|
|
}
|
2021-02-04 19:22:05 +09:00
|
|
|
|
2020-11-02 16:48:56 +02:00
|
|
|
btrfs_free_io_failure_record(inode, start, end);
|
Btrfs: cleanup the read failure record after write or when the inode is freeing
After the data is written successfully, we should cleanup the read failure record
in that range because
- If we set data COW for the file, the range that the failure record pointed to is
mapped to a new place, so it is invalid.
- If we set no data COW for the file, and if there is no error during writing,
the corrupted data is corrected, so the failure record can be removed. And if
some errors happen on the mirrors, we also needn't worry about it because the
failure record will be recreated if we read the same place again.
Sometimes, we may fail to correct the data, so the failure records will be left
in the tree; we need to free them when we free the inode, otherwise a memory leak happens.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-12 18:44:04 +08:00
|
|
|
|
2013-08-29 13:57:21 -04:00
|
|
|
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
|
|
|
|
|
truncated = true;
|
|
|
|
|
logical_len = ordered_extent->truncated_len;
|
|
|
|
|
/* Truncated the entire extent, don't bother adding */
|
|
|
|
|
if (!logical_len)
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-12 09:34:21 +00:00
|
|
|
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
|
2012-03-12 16:03:00 +01:00
|
|
|
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
|
2015-09-08 17:25:56 +08:00
|
|
|
|
2020-11-02 16:48:56 +02:00
|
|
|
btrfs_inode_safe_disk_i_size_write(inode, 0);
|
2019-10-08 20:43:06 +03:00
|
|
|
if (freespace_inode)
|
|
|
|
|
trans = btrfs_join_transaction_spacecache(root);
|
2012-11-09 10:53:21 -05:00
|
|
|
else
|
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
|
trans = NULL;
|
|
|
|
|
goto out;
|
2009-11-12 09:34:21 +00:00
|
|
|
}
|
2020-11-02 16:48:56 +02:00
|
|
|
trans->block_rsv = &inode->block_rsv;
|
2020-11-02 16:49:06 +02:00
|
|
|
ret = btrfs_update_inode_fallback(trans, root, inode);
|
2012-11-09 10:53:21 -05:00
|
|
|
if (ret) /* -ENOMEM or corruption */
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2009-11-12 09:34:21 +00:00
|
|
|
goto out;
|
|
|
|
|
}
|
2008-07-17 12:53:50 -04:00
|
|
|
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be and does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
clear_bits |= EXTENT_LOCKED;
|
2019-12-02 17:34:19 -08:00
|
|
|
lock_extent_bits(io_tree, start, end, &cached_state);
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2019-10-08 20:43:06 +03:00
|
|
|
if (freespace_inode)
|
|
|
|
|
trans = btrfs_join_transaction_spacecache(root);
|
2010-07-02 12:14:14 -04:00
|
|
|
else
|
2011-04-13 12:54:33 -04:00
|
|
|
trans = btrfs_join_transaction(root);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
|
trans = NULL;
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and within holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
goto out;
|
2012-03-12 16:03:00 +01:00
|
|
|
}
|
2014-05-22 16:18:52 -07:00
|
|
|
|
2020-11-02 16:48:56 +02:00
|
|
|
trans->block_rsv = &inode->block_rsv;
|
2009-11-12 09:34:21 +00:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type = ordered_extent->compress_type;
|
2008-10-30 14:25:28 -04:00
|
|
|
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
|
2010-12-17 14:21:50 +08:00
|
|
|
BUG_ON(compress_type);
|
2020-11-02 16:48:56 +02:00
|
|
|
ret = btrfs_mark_extent_written(trans, inode,
|
2008-10-30 14:25:28 -04:00
|
|
|
ordered_extent->file_offset,
|
|
|
|
|
ordered_extent->file_offset +
|
2013-08-29 13:57:21 -04:00
|
|
|
logical_len);
|
btrfs: zoned: prevent allocation from previous data relocation BG
After commit 5f0addf7b890 ("btrfs: zoned: use dedicated lock for data
relocation"), we observe IO errors on e.g, btrfs/232 like below.
[09.0][T4038707] WARNING: CPU: 3 PID: 4038707 at fs/btrfs/extent-tree.c:2381 btrfs_cross_ref_exist+0xfc/0x120 [btrfs]
<snip>
[09.9][T4038707] Call Trace:
[09.5][T4038707] <TASK>
[09.3][T4038707] run_delalloc_nocow+0x7f1/0x11a0 [btrfs]
[09.6][T4038707] ? test_range_bit+0x174/0x320 [btrfs]
[09.2][T4038707] ? fallback_to_cow+0x980/0x980 [btrfs]
[09.3][T4038707] ? find_lock_delalloc_range+0x33e/0x3e0 [btrfs]
[09.5][T4038707] btrfs_run_delalloc_range+0x445/0x1320 [btrfs]
[09.2][T4038707] ? test_range_bit+0x320/0x320 [btrfs]
[09.4][T4038707] ? lock_downgrade+0x6a0/0x6a0
[09.2][T4038707] ? orc_find.part.0+0x1ed/0x300
[09.5][T4038707] ? __module_address.part.0+0x25/0x300
[09.0][T4038707] writepage_delalloc+0x159/0x310 [btrfs]
<snip>
[09.4][ C3] sd 10:0:1:0: [sde] tag#2620 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
[09.5][ C3] sd 10:0:1:0: [sde] tag#2620 Sense Key : Illegal Request [current]
[09.9][ C3] sd 10:0:1:0: [sde] tag#2620 Add. Sense: Unaligned write command
[09.5][ C3] sd 10:0:1:0: [sde] tag#2620 CDB: Write(16) 8a 00 00 00 00 00 02 f3 63 87 00 00 00 2c 00 00
[09.4][ C3] critical target error, dev sde, sector 396041272 op 0x1:(WRITE) flags 0x800 phys_seg 3 prio class 0
[09.9][ C3] BTRFS error (device dm-1): bdev /dev/mapper/dml_102_2 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0
The IO errors occur when we allocate a regular extent in previous data
relocation block group.
On zoned btrfs, we use a dedicated block group to relocate a data
extent. Thus, we allocate relocating data extents (pre-alloc) only from
the dedicated block group and vice versa. Once the free space in the
dedicated block group gets tight, a relocating extent may not fit into
the block group. In that case, we need to switch the dedicated block
group to the next one. Then, the previous one is now freed up for
allocating a regular extent. The BG is already not enough to allocate
the relocating extent, but there is still room to allocate a smaller
extent. Now the problem happens. By allocating a regular extent while
nocow IOs for the relocation are still on-going, we will issue WRITE IOs
(for relocation) and ZONE APPEND IOs (for the regular writes) at the
same time. These mixed IOs confuse the write pointer and cause the
unaligned write errors.
This commit introduces a new bit 'zoned_data_reloc_ongoing' to the
btrfs_block_group. We set this bit before releasing the dedicated block
group, and no extents are allocated from a block group having this bit
set. This bit is similar to setting block_group->ro, but is different from
it by allowing nocow writes to start.
Once all the nocow IO for relocation is done (hooked from
btrfs_finish_ordered_io), we reset the bit to release the block group for
further allocation.
Fixes: c2707a255623 ("btrfs: zoned: add a dedicated data relocation block group")
CC: stable@vger.kernel.org # 5.16+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-07 16:08:29 +09:00
|
|
|
btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
|
|
|
|
|
ordered_extent->disk_num_bytes);
|
2008-10-30 14:25:28 -04:00
|
|
|
} else {
|
2016-06-22 18:54:23 -04:00
|
|
|
BUG_ON(root == fs_info->tree_root);
|
2020-09-18 12:15:51 +03:00
|
|
|
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
|
2018-10-11 15:54:21 -04:00
|
|
|
if (!ret) {
|
|
|
|
|
clear_reserved_extent = false;
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_release_delalloc_bytes(fs_info,
|
2019-12-02 17:34:19 -08:00
|
|
|
ordered_extent->disk_bytenr,
|
|
|
|
|
ordered_extent->disk_num_bytes);
|
2018-10-11 15:54:21 -04:00
|
|
|
}
|
2008-10-30 14:25:28 -04:00
|
|
|
}
|
2020-11-02 16:48:56 +02:00
|
|
|
unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
|
2019-12-02 17:34:19 -08:00
|
|
|
ordered_extent->num_bytes, trans->transid);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret < 0) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and within holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
goto out;
|
2012-03-12 16:03:00 +01:00
|
|
|
}
|
2010-02-03 19:33:23 +00:00
|
|
|
|
2020-09-18 12:15:52 +03:00
|
|
|
ret = add_pending_csums(trans, &ordered_extent->list);
|
2018-01-08 10:59:43 +02:00
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2008-07-17 12:53:50 -04:00
|
|
|
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
/*
|
|
|
|
|
* If this is a new delalloc range, clear its new delalloc flag to
|
|
|
|
|
* update the inode's number of bytes. This needs to be done first
|
|
|
|
|
* before updating the inode item.
|
|
|
|
|
*/
|
|
|
|
|
if ((clear_bits & EXTENT_DELALLOC_NEW) &&
|
|
|
|
|
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
|
2020-11-02 16:48:56 +02:00
|
|
|
clear_extent_bit(&inode->io_tree, start, end,
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
|
|
|
|
|
0, 0, &cached_state);
|
|
|
|
|
|
2020-11-02 16:48:56 +02:00
|
|
|
btrfs_inode_safe_disk_i_size_write(inode, 0);
|
2020-11-02 16:49:06 +02:00
|
|
|
ret = btrfs_update_inode_fallback(trans, root, inode);
|
2012-11-09 10:53:21 -05:00
|
|
|
if (ret) { /* -ENOMEM or corruption */
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and within holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
goto out;
|
2011-04-05 19:25:36 -04:00
|
|
|
}
|
|
|
|
|
ret = 0;
|
2009-11-12 09:34:21 +00:00
|
|
|
out:
|
2020-11-02 16:48:56 +02:00
|
|
|
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
|
2019-12-02 17:34:19 -08:00
|
|
|
(clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
|
2019-12-02 17:34:18 -08:00
|
|
|
&cached_state);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and within holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
|
2012-09-20 01:51:59 -06:00
|
|
|
if (trans)
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2010-07-02 12:14:14 -04:00
|
|
|
|
2013-08-29 13:57:21 -04:00
|
|
|
if (ret || truncated) {
|
2019-12-02 17:34:19 -08:00
|
|
|
u64 unwritten_start = start;
|
2013-08-29 13:57:21 -04:00
|
|
|
|
2021-05-19 09:38:27 -04:00
|
|
|
/*
|
|
|
|
|
* If we failed to finish this ordered extent for any reason we
|
|
|
|
|
* need to make sure BTRFS_ORDERED_IOERR is set on the ordered
|
|
|
|
|
* extent, and mark the inode with the error if it wasn't
|
|
|
|
|
* already set. Any error during writeback would have already
|
|
|
|
|
* set the mapping error, so we need to set it if we're the ones
|
|
|
|
|
* marking this ordered extent as failed.
|
|
|
|
|
*/
|
|
|
|
|
if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
|
|
|
|
|
&ordered_extent->flags))
|
|
|
|
|
mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
|
|
|
|
|
|
2013-08-29 13:57:21 -04:00
|
|
|
if (truncated)
|
2019-12-02 17:34:19 -08:00
|
|
|
unwritten_start += logical_len;
|
|
|
|
|
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
|
2013-08-29 13:57:21 -04:00
|
|
|
|
|
|
|
|
/* Drop the cache for the part of the extent we didn't write. */
|
2020-11-02 16:48:56 +02:00
|
|
|
btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
|
2012-05-02 14:00:54 -04:00
|
|
|
|
2013-01-31 14:58:00 -05:00
|
|
|
/*
|
|
|
|
|
* If the ordered extent had an IOERR or something else went
|
|
|
|
|
* wrong we need to return the space for this ordered extent
|
2013-08-29 13:57:21 -04:00
|
|
|
* back to the allocator. We only free the extent in the
|
|
|
|
|
* truncated case if we didn't write out the extent at all.
|
2018-10-11 15:54:21 -04:00
|
|
|
*
|
|
|
|
|
* If we made it past insert_reserved_file_extent before we
|
|
|
|
|
* errored out then we don't need to do this as the accounting
|
|
|
|
|
* has already been done.
|
2013-01-31 14:58:00 -05:00
|
|
|
*/
|
2013-08-29 13:57:21 -04:00
|
|
|
if ((ret || !logical_len) &&
|
2018-10-11 15:54:21 -04:00
|
|
|
clear_reserved_extent &&
|
2013-08-29 13:57:21 -04:00
|
|
|
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
|
2019-11-21 14:03:29 +02:00
|
|
|
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
|
|
|
|
|
/*
|
|
|
|
|
* Discard the range before returning it back to the
|
|
|
|
|
* free space pool
|
|
|
|
|
*/
|
2019-12-13 16:22:11 -08:00
|
|
|
if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
|
2019-11-21 14:03:29 +02:00
|
|
|
btrfs_discard_extent(fs_info,
|
2019-12-02 17:34:19 -08:00
|
|
|
ordered_extent->disk_bytenr,
|
|
|
|
|
ordered_extent->disk_num_bytes,
|
|
|
|
|
NULL);
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_free_reserved_extent(fs_info,
|
2019-12-02 17:34:19 -08:00
|
|
|
ordered_extent->disk_bytenr,
|
|
|
|
|
ordered_extent->disk_num_bytes, 1);
|
2019-11-21 14:03:29 +02:00
|
|
|
}
|
2013-01-31 14:58:00 -05:00
|
|
|
}
|
|
|
|
|
|
2012-05-02 14:00:54 -04:00
|
|
|
/*
|
2012-06-18 12:14:23 +08:00
|
|
|
* This needs to be done to make sure anybody waiting knows we are done
|
|
|
|
|
* updating everything for this ordered extent.
|
2012-05-02 14:00:54 -04:00
|
|
|
*/
|
2020-11-02 16:48:56 +02:00
|
|
|
btrfs_remove_ordered_extent(inode, ordered_extent);
|
2012-05-02 14:00:54 -04:00
|
|
|
|
2008-07-17 12:53:50 -04:00
|
|
|
/* once for us */
|
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
|
/* once for the tree */
|
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
|
|
2012-05-02 14:00:54 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-08 20:32:27 +08:00
|
|
|
/*
* Report the end of I/O for the byte range [@start, @end] of @inode.
*
* Emits the btrfs_writepage_end_io_hook trace event for the range and then
* hands the range to btrfs_mark_ordered_io_finished(). Nothing else is done
* here and nothing is returned.
*
* @inode:    btrfs inode the range belongs to
* @page:     passed through unchanged to btrfs_mark_ordered_io_finished()
* @start:    first byte of the range
* @end:      last byte of the range; it is converted to a byte count with
*            @end + 1 - @start, so it is treated as inclusive
* @uptodate: forwarded to both the trace event and
*            btrfs_mark_ordered_io_finished(); presumably true when the
*            write succeeded (TODO confirm against callers)
*/
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
|
|
|
|
|
struct page *page, u64 start,
|
2021-07-26 14:15:08 +02:00
|
|
|
u64 end, bool uptodate)
|
2008-07-18 11:56:15 -04:00
|
|
|
{
|
2021-04-08 20:32:27 +08:00
|
|
|
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2022-06-19 08:07:05 +02:00
|
|
|
btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
|
2008-07-18 11:56:15 -04:00
|
|
|
}
|
|
|
|
|
|
btrfs: introduce a data checksum checking helper
Although we have several data csum verification functions, we never had a
function that just verifies the checksum of one sector.
Function check_data_csum() does extra work for error reporting, thus it
requires a lot of extra things like file offset, bio_offset etc.
Function btrfs_verify_data_csum() is even worse, it utilizes the page
checked flag, which means it can not be utilized for direct IO pages.
Here we introduce a new helper, btrfs_check_sector_csum(), which really
only accepts a sector in a page and an expected checksum pointer.
We use this function to implement check_data_csum(), and export it for
an incoming patch.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
[hch: keep passing the csum array as an arguments, as the callers want
to print it, rename per request]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-05-22 13:47:48 +02:00
|
|
|
/*
|
|
|
|
|
* Verify the checksum for a single sector without any extra action that depends
|
|
|
|
|
* on the type of I/O.
|
|
|
|
|
*
* Maps @page, hashes fs_info->sectorsize bytes starting at @pgoff with
* fs_info->csum_shash into @csum, and compares fs_info->csum_size bytes of
* the result against @csum_expected.
*
* @fs_info:       filesystem info providing the hash transform, the sector
*                 size and the checksum size
* @page:          page containing the sector to verify
* @pgoff:         byte offset of the sector inside @page; @pgoff plus the
*                 sector size must not exceed PAGE_SIZE (asserted)
* @csum:          output buffer that receives the computed checksum
* @csum_expected: checksum the computed value is compared against
*
* Return: 0 if the checksums match, -EIO if they differ.
*/
|
|
|
|
|
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
|
|
|
|
|
u32 pgoff, u8 *csum, const u8 * const csum_expected)
|
|
|
|
|
{
|
|
|
|
|
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
|
|
|
|
|
char *kaddr;
|
|
|
|
|
|
|
|
|
|
|
ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
|
|
|
|
|
|
|
|
|
|
|
shash->tfm = fs_info->csum_shash;
|
|
|
|
|
|
|
|
|
|
|
kaddr = kmap_local_page(page) + pgoff;
|
|
|
|
|
/* NOTE(review): the return value of the digest call is not checked. */
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
|
|
|
|
|
/* kaddr points pgoff bytes into the mapping, not at its start. */
kunmap_local(kaddr);
|
|
|
|
|
|
|
|
|
|
|
if (memcmp(csum, csum_expected, fs_info->csum_size))
|
|
|
|
|
return -EIO;
|
|
|
|
|
return 0;
|
2008-07-18 11:56:15 -04:00
|
|
|
}
|
|
|
|
|
|
2020-10-21 14:24:54 +08:00
|
|
|
/*
|
|
|
|
|
* check_data_csum - verify checksum of one sector of uncompressed data
|
2020-12-02 14:47:58 +08:00
|
|
|
* @inode: inode
|
2022-07-07 07:33:29 +02:00
|
|
|
* @bbio: btrfs_bio which contains the csum
|
2020-12-02 14:47:58 +08:00
|
|
|
* @bio_offset: offset to the beginning of the bio (in bytes)
|
2020-10-21 14:24:54 +08:00
|
|
|
* @page: page where is the data to be verified
|
|
|
|
|
* @pgoff: offset inside the page
|
|
|
|
|
*
|
|
|
|
|
* The length of such check is always one sector size.
|
btrfs: introduce a data checksum checking helper
Although we have several data csum verification functions, we never had a
function that just verifies the checksum of one sector.
Function check_data_csum() does extra work for error reporting, thus it
requires a lot of extra things like file offset, bio_offset etc.
Function btrfs_verify_data_csum() is even worse, it utilizes the page
checked flag, which means it can not be utilized for direct IO pages.
Here we introduce a new helper, btrfs_check_sector_csum(), which really
only accepts a sector in a page and an expected checksum pointer.
We use this function to implement check_data_csum(), and export it for
an incoming patch.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
[hch: keep passing the csum array as an arguments, as the callers want
to print it, rename per request]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-05-22 13:47:48 +02:00
|
|
|
*
|
|
|
|
|
* When csum mismatch is detected, we will also report the error and fill the
|
|
|
|
|
* corrupted range with zero. (Thus it needs the extra parameters)
|
2020-10-21 14:24:54 +08:00
|
|
|
*/
|
2022-07-07 07:33:29 +02:00
|
|
|
int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
|
|
|
|
|
u32 bio_offset, struct page *page, u32 pgoff)
|
2014-09-12 18:43:55 +08:00
|
|
|
{
|
2019-06-03 16:58:57 +02:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2020-10-21 14:24:54 +08:00
|
|
|
u32 len = fs_info->sectorsize;
|
2019-06-03 16:58:57 +02:00
|
|
|
u8 *csum_expected;
|
|
|
|
|
u8 csum[BTRFS_CSUM_SIZE];
|
2014-09-12 18:43:55 +08:00
|
|
|
|
2020-10-21 14:24:54 +08:00
|
|
|
ASSERT(pgoff + len <= PAGE_SIZE);
|
|
|
|
|
|
2022-05-22 13:47:52 +02:00
|
|
|
csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
|
2019-06-03 16:58:57 +02:00
|
|
|
|
btrfs: introduce a data checksum checking helper
Although we have several data csum verification code, we never have a
function really just to verify checksum for one sector.
Function check_data_csum() do extra work for error reporting, thus it
requires a lot of extra things like file offset, bio_offset etc.
Function btrfs_verify_data_csum() is even worse, it will utilize page
checked flag, which means it can not be utilized for direct IO pages.
Here we introduce a new helper, btrfs_check_sector_csum(), which really
only accept a sector in page, and expected checksum pointer.
We use this function to implement check_data_csum(), and export it for
incoming patch.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
[hch: keep passing the csum array as an arguments, as the callers want
to print it, rename per request]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-05-22 13:47:48 +02:00
|
|
|
if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
|
2014-09-12 18:43:55 +08:00
|
|
|
goto zeroit;
|
|
|
|
|
return 0;
|
btrfs: introduce a data checksum checking helper
Although we have several data csum verification code, we never have a
function really just to verify checksum for one sector.
Function check_data_csum() do extra work for error reporting, thus it
requires a lot of extra things like file offset, bio_offset etc.
Function btrfs_verify_data_csum() is even worse, it will utilize page
checked flag, which means it can not be utilized for direct IO pages.
Here we introduce a new helper, btrfs_check_sector_csum(), which really
only accept a sector in page, and expected checksum pointer.
We use this function to implement check_data_csum(), and export it for
incoming patch.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
[hch: keep passing the csum array as an arguments, as the callers want
to print it, rename per request]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-05-22 13:47:48 +02:00
|
|
|
|
2014-09-12 18:43:55 +08:00
|
|
|
zeroit:
|
2022-07-07 07:33:29 +02:00
|
|
|
btrfs_print_data_csum_error(BTRFS_I(inode),
|
|
|
|
|
bbio->file_offset + bio_offset,
|
|
|
|
|
csum, csum_expected, bbio->mirror_num);
|
2021-09-15 15:17:18 +08:00
|
|
|
if (bbio->device)
|
|
|
|
|
btrfs_dev_stat_inc_and_print(bbio->device,
|
2020-07-02 15:23:32 +03:00
|
|
|
BTRFS_DEV_STAT_CORRUPTION_ERRS);
|
2022-03-25 17:37:59 +08:00
|
|
|
memzero_page(page, pgoff, len);
|
2014-09-12 18:43:55 +08:00
|
|
|
return -EIO;
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
2020-12-02 14:47:58 +08:00
|
|
|
* When reads are done, we need to check csums to verify the data is correct.
|
2011-07-22 15:41:52 +02:00
|
|
|
* if there's a match, we allow the bio to finish. If not, the code in
|
|
|
|
|
* extent_io.c will try to find good copies for us.
|
2020-12-02 14:47:58 +08:00
|
|
|
*
|
|
|
|
|
* @bio_offset: offset to the beginning of the bio (in bytes)
|
|
|
|
|
* @start: file offset of the range start
|
|
|
|
|
* @end: file offset of the range end (inclusive)
|
btrfs: make btrfs_verify_data_csum() to return a bitmap
This will provide the basis for later per-sector repair for subpage,
while still keeping the existing code happy.
As if all csums match, the return value will be 0, same as now.
Only when csum mismatches, the return value is different.
The new return value will be a bitmap, for 4K sectorsize and 4K page
size, it will be either 1, instead of the -EIO (which is not used
directly by the callers, no effective change).
But for 4K sectorsize and 64K page size, aka subpage case, since the
bvec can contain multiple sectors, knowing which sectors are corrupted
will allow us to submit repair only for corrupted sectors.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-05-03 10:08:54 +08:00
|
|
|
*
|
|
|
|
|
* Return a bitmap where bit set means a csum mismatch, and bit not set means
|
|
|
|
|
* csum match.
|
2008-09-29 15:18:18 -04:00
|
|
|
*/
|
2021-09-15 15:17:18 +08:00
|
|
|
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
|
|
|
|
|
u32 bio_offset, struct page *page,
|
|
|
|
|
u64 start, u64 end)
|
2007-08-30 08:50:51 -04:00
|
|
|
{
|
|
|
|
|
struct inode *inode = page->mapping->host;
|
2021-09-27 15:21:49 +08:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-01-24 16:13:08 -05:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2007-10-15 16:22:25 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2020-12-02 14:47:59 +08:00
|
|
|
const u32 sectorsize = root->fs_info->sectorsize;
|
|
|
|
|
u32 pg_off;
|
btrfs: make btrfs_verify_data_csum() to return a bitmap
This will provide the basis for later per-sector repair for subpage,
while still keeping the existing code happy.
As if all csums match, the return value will be 0, same as now.
Only when csum mismatches, the return value is different.
The new return value will be a bitmap, for 4K sectorsize and 4K page
size, it will be either 1, instead of the -EIO (which is not used
directly by the callers, no effective change).
But for 4K sectorsize and 64K page size, aka subpage case, since the
bvec can contain multiple sectors, knowing which sectors are corrupted
will allow us to submit repair only for corrupted sectors.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-05-03 10:08:54 +08:00
|
|
|
unsigned int result = 0;
|
2008-01-24 16:13:08 -05:00
|
|
|
|
btrfs: subpage: check if there are compressed extents inside one page
[BUG]
When testing experimental subpage compressed write support, it hits a
NULL pointer dereference inside read path:
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000018
pc : __pi_memcmp+0x28/0x1ec
lr : check_data_csum+0xd0/0x274 [btrfs]
Call trace:
__pi_memcmp+0x28/0x1ec
btrfs_verify_data_csum+0xf4/0x244 [btrfs]
end_bio_extent_readpage+0x1d0/0x6b0 [btrfs]
bio_endio+0x15c/0x1dc
end_workqueue_fn+0x44/0x64 [btrfs]
btrfs_work_helper+0x74/0x250 [btrfs]
process_one_work+0x1d4/0x47c
worker_thread+0x180/0x400
kthread+0x11c/0x120
ret_from_fork+0x10/0x30
Code: 54000261 d100044c d343fd8c f8408403 (f8408424)
---[ end trace 9e2c59f33ea40866 ]---
[CAUSE]
When reading two compressed extents inside the same page, like the
following layout, we trigger above crash:
0 32K 64K
|-------|\\\\\\\|
| \- Compressed extent (A)
\--------- Compressed extent (B)
For compressed read, we don't need to populate its io_bio->csum, as we
rely on compressed_bio->csum to verify the compressed data, and then
copy the decompressed to inode pages.
Normally btrfs_verify_data_csum() skip such page by checking and
clearing its PageChecked flag
But since that flag is still for the full page, when endio for inode
page range [0, 32K) gets executed, it clears PageChecked flag for the
full page.
Then when endio for inode page range [32K, 64K) gets executed, since the
page no longer has PageChecked flag, it just continues checking, even
though io_bio->csum is NULL.
[FIX]
Thankfully there are only two users of PageChecked bit:
- Cow fixup
Since subpage has its own way to trace page dirty (dirty_bitmap) and
ordered bit (ordered_bitmap), it should never trigger cow fixup.
- Compressed read
We can distinguish such read by just checking io_bio->csum.
So just check io_bio->csum before doing the verification to avoid such
NULL pointer dereference.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-26 14:34:51 +08:00
|
|
|
/*
|
2021-09-27 15:21:49 +08:00
|
|
|
* This only happens for NODATASUM or compressed read.
|
|
|
|
|
* Normally this should be covered by above check for compressed read
|
|
|
|
|
* or the next check for NODATASUM. Just do a quicker exit here.
|
btrfs: subpage: check if there are compressed extents inside one page
[BUG]
When testing experimental subpage compressed write support, it hits a
NULL pointer dereference inside read path:
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000018
pc : __pi_memcmp+0x28/0x1ec
lr : check_data_csum+0xd0/0x274 [btrfs]
Call trace:
__pi_memcmp+0x28/0x1ec
btrfs_verify_data_csum+0xf4/0x244 [btrfs]
end_bio_extent_readpage+0x1d0/0x6b0 [btrfs]
bio_endio+0x15c/0x1dc
end_workqueue_fn+0x44/0x64 [btrfs]
btrfs_work_helper+0x74/0x250 [btrfs]
process_one_work+0x1d4/0x47c
worker_thread+0x180/0x400
kthread+0x11c/0x120
ret_from_fork+0x10/0x30
Code: 54000261 d100044c d343fd8c f8408403 (f8408424)
---[ end trace 9e2c59f33ea40866 ]---
[CAUSE]
When reading two compressed extents inside the same page, like the
following layout, we trigger above crash:
0 32K 64K
|-------|\\\\\\\|
| \- Compressed extent (A)
\--------- Compressed extent (B)
For compressed read, we don't need to populate its io_bio->csum, as we
rely on compressed_bio->csum to verify the compressed data, and then
copy the decompressed to inode pages.
Normally btrfs_verify_data_csum() skip such page by checking and
clearing its PageChecked flag
But since that flag is still for the full page, when endio for inode
page range [0, 32K) gets executed, it clears PageChecked flag for the
full page.
Then when endio for inode page range [32K, 64K) gets executed, since the
page no longer has PageChecked flag, it just continues checking, even
though io_bio->csum is NULL.
[FIX]
Thankfully there are only two users of PageChecked bit:
- Cow fixup
Since subpage has its own way to trace page dirty (dirty_bitmap) and
ordered bit (ordered_bitmap), it should never trigger cow fixup.
- Compressed read
We can distinguish such read by just checking io_bio->csum.
So just check io_bio->csum before doing the verification to avoid such
NULL pointer dereference.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-26 14:34:51 +08:00
|
|
|
*/
|
2021-09-15 15:17:18 +08:00
|
|
|
if (bbio->csum == NULL)
|
2014-09-12 18:43:55 +08:00
|
|
|
return 0;
|
2008-12-12 10:03:38 -05:00
|
|
|
|
2009-04-17 10:37:41 +02:00
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
|
2020-10-16 11:29:18 -04:00
|
|
|
return 0;
|
|
|
|
|
|
2021-11-05 16:45:47 -04:00
|
|
|
if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
|
2007-12-14 15:30:32 -05:00
|
|
|
return 0;
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 16:58:54 -05:00
|
|
|
|
2020-12-02 14:47:59 +08:00
|
|
|
ASSERT(page_offset(page) <= start &&
|
|
|
|
|
end <= page_offset(page) + PAGE_SIZE - 1);
|
|
|
|
|
for (pg_off = offset_in_page(start);
|
|
|
|
|
pg_off < offset_in_page(end);
|
|
|
|
|
pg_off += sectorsize, bio_offset += sectorsize) {
|
btrfs: subpage: fix false alert when relocating partial preallocated data extents
[BUG]
When relocating partial preallocated data extents (part of the
preallocated extent is written) for subpage, it can cause the following
false alert and make the relocation to fail:
BTRFS info (device dm-3): balance: start -d
BTRFS info (device dm-3): relocating block group 13631488 flags data
BTRFS warning (device dm-3): csum failed root -9 ino 257 off 4096 csum 0x98757625 expected csum 0x00000000 mirror 1
BTRFS error (device dm-3): bdev /dev/mapper/arm_nvme-test errs: wr 0, rd 0, flush 0, corrupt 1, gen 0
BTRFS warning (device dm-3): csum failed root -9 ino 257 off 4096 csum 0x98757625 expected csum 0x00000000 mirror 1
BTRFS error (device dm-3): bdev /dev/mapper/arm_nvme-test errs: wr 0, rd 0, flush 0, corrupt 2, gen 0
BTRFS info (device dm-3): balance: ended with status: -5
The minimal script to reproduce looks like this:
mkfs.btrfs -f -s 4k $dev
mount $dev -o nospace_cache $mnt
xfs_io -f -c "falloc 0 8k" $mnt/file
xfs_io -f -c "pwrite 0 4k" $mnt/file
btrfs balance start -d $mnt
[CAUSE]
Function btrfs_verify_data_csum() checks if the full range has
EXTENT_NODATASUM bit for data reloc inode, if *all* bytes of the range
have EXTENT_NODATASUM bit, then it skip the range.
This works pretty well for regular sectorsize, as in that case
btrfs_verify_data_csum() is called for each sector, thus no problem at
all.
But for subpage case, btrfs_verify_data_csum() is called on each bvec,
which can contain several sectors, and since it checks *all* bytes for
EXTENT_NODATASUM bit, if we have some range with csum, then we will
continue checking all the sectors.
For the preallocated sectors, it doesn't have any csum, thus obviously
the csum won't match and cause the false alert.
[FIX]
Move the EXTENT_NODATASUM check into the main loop, so that we can check
each sector for EXTENT_NODATASUM bit for subpage case.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-26 14:35:04 +08:00
|
|
|
u64 file_offset = pg_off + page_offset(page);
|
2020-12-02 14:47:59 +08:00
|
|
|
int ret;
|
|
|
|
|
|
2021-09-09 01:19:25 +09:00
|
|
|
if (btrfs_is_data_reloc_root(root) &&
|
btrfs: subpage: fix false alert when relocating partial preallocated data extents
[BUG]
When relocating partial preallocated data extents (part of the
preallocated extent is written) for subpage, it can cause the following
false alert and make the relocation to fail:
BTRFS info (device dm-3): balance: start -d
BTRFS info (device dm-3): relocating block group 13631488 flags data
BTRFS warning (device dm-3): csum failed root -9 ino 257 off 4096 csum 0x98757625 expected csum 0x00000000 mirror 1
BTRFS error (device dm-3): bdev /dev/mapper/arm_nvme-test errs: wr 0, rd 0, flush 0, corrupt 1, gen 0
BTRFS warning (device dm-3): csum failed root -9 ino 257 off 4096 csum 0x98757625 expected csum 0x00000000 mirror 1
BTRFS error (device dm-3): bdev /dev/mapper/arm_nvme-test errs: wr 0, rd 0, flush 0, corrupt 2, gen 0
BTRFS info (device dm-3): balance: ended with status: -5
The minimal script to reproduce looks like this:
mkfs.btrfs -f -s 4k $dev
mount $dev -o nospace_cache $mnt
xfs_io -f -c "falloc 0 8k" $mnt/file
xfs_io -f -c "pwrite 0 4k" $mnt/file
btrfs balance start -d $mnt
[CAUSE]
Function btrfs_verify_data_csum() checks if the full range has
EXTENT_NODATASUM bit for data reloc inode, if *all* bytes of the range
have EXTENT_NODATASUM bit, then it skip the range.
This works pretty well for regular sectorsize, as in that case
btrfs_verify_data_csum() is called for each sector, thus no problem at
all.
But for subpage case, btrfs_verify_data_csum() is called on each bvec,
which can contain several sectors, and since it checks *all* bytes for
EXTENT_NODATASUM bit, if we have some range with csum, then we will
continue checking all the sectors.
For the preallocated sectors, it doesn't have any csum, thus obviously
the csum won't match and cause the false alert.
[FIX]
Move the EXTENT_NODATASUM check into the main loop, so that we can check
each sector for EXTENT_NODATASUM bit for subpage case.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-26 14:35:04 +08:00
|
|
|
test_range_bit(io_tree, file_offset,
|
|
|
|
|
file_offset + sectorsize - 1,
|
|
|
|
|
EXTENT_NODATASUM, 1, NULL)) {
|
|
|
|
|
/* Skip the range without csum for data reloc inode */
|
|
|
|
|
clear_extent_bits(io_tree, file_offset,
|
|
|
|
|
file_offset + sectorsize - 1,
|
|
|
|
|
EXTENT_NODATASUM);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2022-07-07 07:33:29 +02:00
|
|
|
ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
|
btrfs: make btrfs_verify_data_csum() to return a bitmap
This will provide the basis for later per-sector repair for subpage,
while still keeping the existing code happy.
As if all csums match, the return value will be 0, same as now.
Only when csum mismatches, the return value is different.
The new return value will be a bitmap, for 4K sectorsize and 4K page
size, it will be either 1, instead of the -EIO (which is not used
directly by the callers, no effective change).
But for 4K sectorsize and 64K page size, aka subpage case, since the
bvec can contain multiple sectors, knowing which sectors are corrupted
will allow us to submit repair only for corrupted sectors.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-05-03 10:08:54 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
|
const int nr_bit = (pg_off - offset_in_page(start)) >>
|
|
|
|
|
root->fs_info->sectorsize_bits;
|
|
|
|
|
|
|
|
|
|
result |= (1U << nr_bit);
|
|
|
|
|
}
|
2020-12-02 14:47:59 +08:00
|
|
|
}
|
btrfs: make btrfs_verify_data_csum() to return a bitmap
This will provide the basis for later per-sector repair for subpage,
while still keeping the existing code happy.
As if all csums match, the return value will be 0, same as now.
Only when csum mismatches, the return value is different.
The new return value will be a bitmap, for 4K sectorsize and 4K page
size, it will be either 1, instead of the -EIO (which is not used
directly by the callers, no effective change).
But for 4K sectorsize and 64K page size, aka subpage case, since the
bvec can contain multiple sectors, knowing which sectors are corrupted
will allow us to submit repair only for corrupted sectors.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-05-03 10:08:54 +08:00
|
|
|
return result;
|
2007-08-30 08:50:51 -04:00
|
|
|
}
|
2007-08-27 16:49:44 -04:00
|
|
|
|
2018-01-16 09:31:58 +02:00
|
|
|
/*
|
|
|
|
|
* btrfs_add_delayed_iput - perform a delayed iput on @inode
|
|
|
|
|
*
|
|
|
|
|
* @inode: The inode we want to perform iput on
|
|
|
|
|
*
|
|
|
|
|
* This function uses the generic vfs_inode::i_count to track whether we should
|
|
|
|
|
* just decrement it (in case it's > 1) or if this is the last iput then link
|
|
|
|
|
* the inode to the delayed iput machinery. Delayed iputs are processed at
|
|
|
|
|
* transaction commit time/superblock commit/cleaner kthread.
|
|
|
|
|
*/
|
2009-11-12 09:36:34 +00:00
|
|
|
void btrfs_add_delayed_iput(struct inode *inode)
|
|
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2015-11-19 14:15:51 +01:00
|
|
|
struct btrfs_inode *binode = BTRFS_I(inode);
|
2009-11-12 09:36:34 +00:00
|
|
|
|
|
|
|
|
if (atomic_add_unless(&inode->i_count, -1, 1))
|
|
|
|
|
return;
|
|
|
|
|
|
2018-12-03 11:06:52 -05:00
|
|
|
atomic_inc(&fs_info->nr_delayed_iputs);
|
2009-11-12 09:36:34 +00:00
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2018-01-16 09:31:58 +02:00
|
|
|
ASSERT(list_empty(&binode->delayed_iput));
|
|
|
|
|
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
|
2009-11-12 09:36:34 +00:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
2019-01-11 10:21:02 -05:00
|
|
|
if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
|
|
|
|
|
wake_up_process(fs_info->cleaner_kthread);
|
2009-11-12 09:36:34 +00:00
|
|
|
}
|
|
|
|
|
|
2019-06-18 10:59:18 -04:00
|
|
|
static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
|
|
|
|
|
struct btrfs_inode *inode)
|
|
|
|
|
{
|
|
|
|
|
list_del_init(&inode->delayed_iput);
|
|
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
|
iput(&inode->vfs_inode);
|
|
|
|
|
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
|
|
|
|
|
wake_up(&fs_info->delayed_iputs_wait);
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
|
|
|
|
|
struct btrfs_inode *inode)
|
|
|
|
|
{
|
|
|
|
|
if (!list_empty(&inode->delayed_iput)) {
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
|
|
|
|
if (!list_empty(&inode->delayed_iput))
|
|
|
|
|
run_delayed_iput_locked(fs_info, inode);
|
|
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
|
2009-11-12 09:36:34 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2015-11-19 14:15:51 +01:00
|
|
|
while (!list_empty(&fs_info->delayed_iputs)) {
|
|
|
|
|
struct btrfs_inode *inode;
|
|
|
|
|
|
|
|
|
|
inode = list_first_entry(&fs_info->delayed_iputs,
|
|
|
|
|
struct btrfs_inode, delayed_iput);
|
2019-06-18 10:59:18 -04:00
|
|
|
run_delayed_iput_locked(fs_info, inode);
|
2021-04-29 10:51:34 -04:00
|
|
|
cond_resched_lock(&fs_info->delayed_iput_lock);
|
2009-11-12 09:36:34 +00:00
|
|
|
}
|
2015-11-19 14:15:51 +01:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
2009-11-12 09:36:34 +00:00
|
|
|
}
|
|
|
|
|
|
2018-12-03 11:06:52 -05:00
|
|
|
/**
|
2021-01-22 11:57:59 +02:00
|
|
|
* Wait for flushing all delayed iputs
|
|
|
|
|
*
|
|
|
|
|
* @fs_info: the filesystem
|
2018-12-03 11:06:52 -05:00
|
|
|
*
|
|
|
|
|
* This will wait on any delayed iputs that are currently running with KILLABLE
|
|
|
|
|
* set. Once they are all done running we will return, unless we are killed in
|
|
|
|
|
* which case we return EINTR. This helps in user operations like fallocate etc
|
|
|
|
|
* that might get blocked on the iputs.
|
2021-01-22 11:57:59 +02:00
|
|
|
*
|
|
|
|
|
* Return EINTR if we were killed, 0 if nothing's pending
|
2018-12-03 11:06:52 -05:00
|
|
|
*/
|
|
|
|
|
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
|
|
|
|
|
{
|
|
|
|
|
int ret = wait_event_killable(fs_info->delayed_iputs_wait,
|
|
|
|
|
atomic_read(&fs_info->nr_delayed_iputs) == 0);
|
|
|
|
|
if (ret)
|
|
|
|
|
return -EINTR;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2008-07-24 12:17:14 -04:00
|
|
|
/*
|
2018-05-11 13:13:32 -07:00
|
|
|
* This creates an orphan entry for the given inode in case something goes wrong
|
|
|
|
|
* in the middle of an unlink.
|
2008-07-24 12:17:14 -04:00
|
|
|
*/
|
2017-02-20 13:50:59 +02:00
|
|
|
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
|
2018-05-11 13:13:37 -07:00
|
|
|
struct btrfs_inode *inode)
|
2008-07-24 12:17:14 -04:00
|
|
|
{
|
2010-05-16 10:49:58 -04:00
|
|
|
int ret;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2018-05-11 13:13:37 -07:00
|
|
|
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
|
|
|
|
|
if (ret && ret != -EEXIST) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
return ret;
|
2010-05-16 10:49:58 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return 0;
|
2008-07-24 12:17:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2018-05-11 13:13:32 -07:00
|
|
|
* We have done the delete so we can go ahead and remove the orphan item for
|
|
|
|
|
* this particular inode.
|
2008-07-24 12:17:14 -04:00
|
|
|
*/
|
2013-04-25 20:41:01 +00:00
|
|
|
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
|
2017-02-20 13:50:58 +02:00
|
|
|
struct btrfs_inode *inode)
|
2008-07-24 12:17:14 -04:00
|
|
|
{
|
2018-05-11 13:13:37 -07:00
|
|
|
return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
|
2008-07-24 12:17:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* this cleans up any orphans that may be left on the list from the last use
|
|
|
|
|
* of this root.
|
|
|
|
|
*/
|
2011-01-31 16:22:42 -05:00
|
|
|
int btrfs_orphan_cleanup(struct btrfs_root *root)
|
2008-07-24 12:17:14 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-07-24 12:17:14 -04:00
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
struct inode *inode;
|
2011-09-26 15:55:20 -04:00
|
|
|
u64 last_objectid = 0;
|
2018-05-11 13:13:32 -07:00
|
|
|
int ret = 0, nr_unlink = 0;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2021-11-09 10:12:06 -05:00
|
|
|
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
|
2011-01-31 16:22:42 -05:00
|
|
|
return 0;
|
2009-11-12 09:34:40 +00:00
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-01-31 16:22:42 -05:00
|
|
|
if (!path) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2015-11-27 16:31:35 +01:00
|
|
|
path->reada = READA_BACK;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
|
|
|
|
key.objectid = BTRFS_ORPHAN_OBJECTID;
|
2014-06-04 18:41:45 +02:00
|
|
|
key.type = BTRFS_ORPHAN_ITEM_KEY;
|
2008-07-24 12:17:14 -04:00
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
2011-01-31 16:22:42 -05:00
|
|
|
if (ret < 0)
|
|
|
|
|
goto out;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* if ret == 0 means we found what we were searching for, which
|
2011-03-30 22:57:33 -03:00
|
|
|
* is weird, but possible, so only screw with path if we didn't
|
2008-07-24 12:17:14 -04:00
|
|
|
* find the key and see if we have stuff that matches
|
|
|
|
|
*/
|
|
|
|
|
if (ret > 0) {
|
2011-01-31 16:22:42 -05:00
|
|
|
ret = 0;
|
2008-07-24 12:17:14 -04:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
|
break;
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* pull out the item */
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
|
|
|
|
/* make sure the item matches what we want */
|
|
|
|
|
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
|
|
|
|
|
break;
|
2014-06-04 18:41:45 +02:00
|
|
|
if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
|
2008-07-24 12:17:14 -04:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
/* release the path since we're done with it */
|
2011-04-21 01:20:15 +02:00
|
|
|
btrfs_release_path(path);
|
2008-07-24 12:17:14 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* this is where we are basically btrfs_lookup, without the
|
|
|
|
|
* crossing root thing. we store the inode number in the
|
|
|
|
|
* offset of the orphan item.
|
|
|
|
|
*/
|
2011-09-26 15:55:20 -04:00
|
|
|
|
|
|
|
|
if (found_key.offset == last_objectid) {
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
|
"Error removing orphan entry, stopping orphan cleanup");
|
2011-09-26 15:55:20 -04:00
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
last_objectid = found_key.offset;
|
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
found_key.objectid = found_key.offset;
|
|
|
|
|
found_key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
|
found_key.offset = 0;
|
2020-05-15 19:35:59 +02:00
|
|
|
inode = btrfs_iget(fs_info->sb, last_objectid, root);
|
2013-07-15 11:20:32 +09:30
|
|
|
ret = PTR_ERR_OR_ZERO(inode);
|
2016-06-06 11:51:25 +01:00
|
|
|
if (ret && ret != -ENOENT)
|
2011-01-31 16:22:42 -05:00
|
|
|
goto out;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
if (ret == -ENOENT && root == fs_info->tree_root) {
|
2011-12-14 20:12:02 -05:00
|
|
|
struct btrfs_root *dead_root;
|
|
|
|
|
int is_dead_root = 0;
|
|
|
|
|
|
|
|
|
|
/*
|
2021-03-16 16:54:13 +00:00
|
|
|
* This is an orphan in the tree root. Currently these
|
2011-12-14 20:12:02 -05:00
|
|
|
* could come from 2 sources:
|
2021-03-16 16:54:13 +00:00
|
|
|
* a) a root (snapshot/subvolume) deletion in progress
|
2011-12-14 20:12:02 -05:00
|
|
|
* b) a free space cache inode
|
2021-03-16 16:54:13 +00:00
|
|
|
* We need to distinguish those two, as the orphan item
|
|
|
|
|
* for a root must not get deleted before the deletion
|
|
|
|
|
* of the snapshot/subvolume's tree completes.
|
|
|
|
|
*
|
|
|
|
|
* btrfs_find_orphan_roots() ran before us, which has
|
|
|
|
|
* found all deleted roots and loaded them into
|
2022-07-15 13:59:21 +02:00
|
|
|
* fs_info->fs_roots_radix. So here we can find if an
|
2021-03-16 16:54:13 +00:00
|
|
|
* orphan item corresponds to a deleted root by looking
|
2022-07-15 13:59:21 +02:00
|
|
|
* up the root from that radix tree.
|
2011-12-14 20:12:02 -05:00
|
|
|
*/
|
btrfs: speedup dead root detection during orphan cleanup
When mounting, we handle deleted subvolume and orphan items. First, we
find all orphan roots and add them to the fs_roots radix tree. Second, in
the tree root, we process each orphan item, skipping it if it is a dead root.
The original algorithm is based on the list of dead_roots, one by one to
visit and check whether the objectid is consistent, the time complexity
is O (n ^ 2). When processing 50000 deleted subvols, it takes about
120s.
btrfs_find_orphan_roots has already run before us and added the
deleted subvolumes to the fs_roots radix tree.
The fs root will only be removed from the fs_roots radix tree after the
cleaner process is started, and the cleaner will only start execution
after the mount is complete.
btrfs_orphan_cleanup can be called during the whole filesystem mount
lifetime, but only "tree root" will be used in this section of code, and
only mount time will be brought into tree root.
So we can quickly check whether the orphan item is dead root through the
fs_roots radix tree.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-07 10:54:40 +08:00
|
|
|
|
2022-07-15 13:59:21 +02:00
|
|
|
spin_lock(&fs_info->fs_roots_radix_lock);
|
|
|
|
|
dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
|
|
|
|
|
(unsigned long)found_key.objectid);
|
btrfs: speedup dead root detection during orphan cleanup
When mounting, we handle deleted subvolume and orphan items. First, we
find all orphan roots and add them to the fs_roots radix tree. Second, in
the tree root, we process each orphan item, skipping it if it is a dead root.
The original algorithm is based on the list of dead_roots, one by one to
visit and check whether the objectid is consistent, the time complexity
is O (n ^ 2). When processing 50000 deleted subvols, it takes about
120s.
btrfs_find_orphan_roots has already run before us and added the
deleted subvolumes to the fs_roots radix tree.
The fs root will only be removed from the fs_roots radix tree after the
cleaner process is started, and the cleaner will only start execution
after the mount is complete.
btrfs_orphan_cleanup can be called during the whole filesystem mount
lifetime, but only "tree root" will be used in this section of code, and
only mount time will be brought into tree root.
So we can quickly check whether the orphan item is dead root through the
fs_roots radix tree.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-07 10:54:40 +08:00
|
|
|
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
|
|
|
|
|
is_dead_root = 1;
|
2022-07-15 13:59:21 +02:00
|
|
|
spin_unlock(&fs_info->fs_roots_radix_lock);
|
btrfs: speedup dead root detection during orphan cleanup
When mounting, we handle deleted subvolume and orphan items. First, we
find all orphan roots and add them to the fs_roots radix tree. Second, in
the tree root, we process each orphan item, skipping it if it is a dead root.
The original algorithm is based on the list of dead_roots, one by one to
visit and check whether the objectid is consistent, the time complexity
is O (n ^ 2). When processing 50000 deleted subvols, it takes about
120s.
btrfs_find_orphan_roots has already run before us and added the
deleted subvolumes to the fs_roots radix tree.
The fs root will only be removed from the fs_roots radix tree after the
cleaner process is started, and the cleaner will only start execution
after the mount is complete.
btrfs_orphan_cleanup can be called during the whole filesystem mount
lifetime, but only "tree root" will be used in this section of code, and
only mount time will be brought into tree root.
So we can quickly check whether the orphan item is dead root through the
fs_roots radix tree.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-07 10:54:40 +08:00
|
|
|
|
2011-12-14 20:12:02 -05:00
|
|
|
if (is_dead_root) {
|
|
|
|
|
/* prevent this orphan from being found again */
|
|
|
|
|
key.offset = found_key.objectid - 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2018-05-11 13:13:32 -07:00
|
|
|
|
2011-12-14 20:12:02 -05:00
|
|
|
}
|
2018-05-11 13:13:32 -07:00
|
|
|
|
2008-07-24 12:17:14 -04:00
|
|
|
/*
|
2018-05-11 13:13:32 -07:00
|
|
|
* If we have an inode with links, there are a couple of
|
btrfs: verity metadata orphan items
Writing out the verity data is too large of an operation to do in a
single transaction. If we are interrupted before we finish creating
fsverity metadata for a file, or fail to clean up already created
metadata after a failure, we could leak the verity items that we already
committed.
To address this issue, we use the orphan mechanism. When we start
enabling verity on a file, we also add an orphan item for that inode.
When we are finished, we delete the orphan. However, if we are
interrupted midway, the orphan will be present at mount and we can
cleanup the half-formed verity state.
There is a possible race with a normal unlink operation: if unlink and
verity run on the same file in parallel, it is possible for verity to
succeed and delete the still legitimate orphan added by unlink. Then, if
we are interrupted and mount in that state, we will never clean up the
inode properly. This is also possible for a file created with O_TMPFILE.
Check nlink==0 before deleting to avoid this race.
A final thing to note is that this is a resurrection of using orphans to
signal an operation besides "delete this inode". The old case was to
signal the need to do a truncate. That case still technically applies
for mounting very old file systems, so we need to take some care to not
clobber it. To that end, we just have to be careful that verity orphan
cleanup is a no-op for non-verity files.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:50 -07:00
|
|
|
* possibilities:
|
|
|
|
|
*
|
|
|
|
|
* 1. We were halfway through creating fsverity metadata for the
|
|
|
|
|
* file. In that case, the orphan item represents incomplete
|
|
|
|
|
* fsverity metadata which must be cleaned up with
|
|
|
|
|
* btrfs_drop_verity_items and deleting the orphan item.
|
|
|
|
|
|
|
|
|
|
* 2. Old kernels (before v3.12) used to create an
|
2018-05-11 13:13:32 -07:00
|
|
|
* orphan item for truncate indicating that there were possibly
|
|
|
|
|
* extent items past i_size that needed to be deleted. In v3.12,
|
|
|
|
|
* truncate was changed to update i_size in sync with the extent
|
|
|
|
|
* items, but the (useless) orphan item was still created. Since
|
|
|
|
|
* v4.18, we don't create the orphan item for truncate at all.
|
|
|
|
|
*
|
|
|
|
|
* So, this item could mean that we need to do a truncate, but
|
|
|
|
|
* only if this filesystem was last used on a pre-v3.12 kernel
|
|
|
|
|
* and was not cleanly unmounted. The odds of that are quite
|
|
|
|
|
* slim, and it's a pain to do the truncate now, so just delete
|
|
|
|
|
* the orphan item.
|
|
|
|
|
*
|
|
|
|
|
* It's also possible that this orphan item was supposed to be
|
|
|
|
|
* deleted but wasn't. The inode number may have been reused,
|
|
|
|
|
* but either way, we can delete the orphan item.
|
2008-07-24 12:17:14 -04:00
|
|
|
*/
|
2018-05-11 13:13:32 -07:00
|
|
|
if (ret == -ENOENT || inode->i_nlink) {
|
btrfs: verity metadata orphan items
Writing out the verity data is too large of an operation to do in a
single transaction. If we are interrupted before we finish creating
fsverity metadata for a file, or fail to clean up already created
metadata after a failure, we could leak the verity items that we already
committed.
To address this issue, we use the orphan mechanism. When we start
enabling verity on a file, we also add an orphan item for that inode.
When we are finished, we delete the orphan. However, if we are
interrupted midway, the orphan will be present at mount and we can
cleanup the half-formed verity state.
There is a possible race with a normal unlink operation: if unlink and
verity run on the same file in parallel, it is possible for verity to
succeed and delete the still legitimate orphan added by unlink. Then, if
we are interrupted and mount in that state, we will never clean up the
inode properly. This is also possible for a file created with O_TMPFILE.
Check nlink==0 before deleting to avoid this race.
A final thing to note is that this is a resurrection of using orphans to
signal an operation besides "delete this inode". The old case was to
signal the need to do a truncate. That case still technically applies
for mounting very old file systems, so we need to take some care to not
clobber it. To that end, we just have to be careful that verity orphan
cleanup is a no-op for non-verity files.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:50 -07:00
|
|
|
if (!ret) {
|
|
|
|
|
ret = btrfs_drop_verity_items(BTRFS_I(inode));
|
2018-05-11 13:13:32 -07:00
|
|
|
iput(inode);
|
btrfs: verity metadata orphan items
Writing out the verity data is too large of an operation to do in a
single transaction. If we are interrupted before we finish creating
fsverity metadata for a file, or fail to clean up already created
metadata after a failure, we could leak the verity items that we already
committed.
To address this issue, we use the orphan mechanism. When we start
enabling verity on a file, we also add an orphan item for that inode.
When we are finished, we delete the orphan. However, if we are
interrupted midway, the orphan will be present at mount and we can
cleanup the half-formed verity state.
There is a possible race with a normal unlink operation: if unlink and
verity run on the same file in parallel, it is possible for verity to
succeed and delete the still legitimate orphan added by unlink. Then, if
we are interrupted and mount in that state, we will never clean up the
inode properly. This is also possible for a file created with O_TMPFILE.
Check nlink==0 before deleting to avoid this race.
A final thing to note is that this is a resurrection of using orphans to
signal an operation besides "delete this inode". The old case was to
signal the need to do a truncate. That case still technically applies
for mounting very old file systems, so we need to take some care to not
clobber it. To that end, we just have to be careful that verity orphan
cleanup is a no-op for non-verity files.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:50 -07:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2011-09-21 16:55:59 -04:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2011-01-31 16:22:42 -05:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_debug(fs_info, "auto deleting %Lu",
|
|
|
|
|
found_key.objectid);
|
2011-09-21 16:55:59 -04:00
|
|
|
ret = btrfs_del_orphan_item(trans, root,
|
|
|
|
|
found_key.objectid);
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2013-08-13 14:10:08 -04:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
2008-07-24 12:17:14 -04:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-11 13:13:32 -07:00
|
|
|
nr_unlink++;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
|
|
|
|
/* this will do delete_inode and everything for us */
|
|
|
|
|
iput(inode);
|
|
|
|
|
}
|
2011-11-10 20:45:05 -05:00
|
|
|
/* release the path since we're done with it */
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
2018-05-11 13:13:38 -07:00
|
|
|
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
|
2011-04-13 12:54:33 -04:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-31 16:22:42 -05:00
|
|
|
if (!IS_ERR(trans))
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2010-05-16 10:49:58 -04:00
|
|
|
}
|
2008-07-24 12:17:14 -04:00
|
|
|
|
|
|
|
|
if (nr_unlink)
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
|
2011-01-31 16:22:42 -05:00
|
|
|
|
|
|
|
|
out:
|
|
|
|
|
if (ret)
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
|
2011-01-31 16:22:42 -05:00
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
2008-07-24 12:17:14 -04:00
|
|
|
}
|
|
|
|
|
|
2009-04-27 11:47:50 -04:00
|
|
|
/*
|
|
|
|
|
* very simple check to peek ahead in the leaf looking for xattrs. If we
|
|
|
|
|
* don't find any xattrs, we know there can't be any acls.
|
|
|
|
|
*
|
|
|
|
|
* slot is the slot the inode is in, objectid is the objectid of the inode
|
|
|
|
|
*/
|
|
|
|
|
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
int slot, u64 objectid,
|
|
|
|
|
int *first_xattr_slot)
|
2009-04-27 11:47:50 -04:00
|
|
|
{
|
|
|
|
|
u32 nritems = btrfs_header_nritems(leaf);
|
|
|
|
|
struct btrfs_key found_key;
|
2013-06-19 10:16:26 -04:00
|
|
|
static u64 xattr_access = 0;
|
|
|
|
|
static u64 xattr_default = 0;
|
2009-04-27 11:47:50 -04:00
|
|
|
int scanned = 0;
|
|
|
|
|
|
2013-06-19 10:16:26 -04:00
|
|
|
if (!xattr_access) {
|
2015-12-02 14:44:35 +01:00
|
|
|
xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
|
|
|
|
|
strlen(XATTR_NAME_POSIX_ACL_ACCESS));
|
|
|
|
|
xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
|
|
|
|
|
strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
|
2013-06-19 10:16:26 -04:00
|
|
|
}
|
|
|
|
|
|
2009-04-27 11:47:50 -04:00
|
|
|
slot++;
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
*first_xattr_slot = -1;
|
2009-04-27 11:47:50 -04:00
|
|
|
while (slot < nritems) {
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
|
|
/* we found a different objectid, there must not be acls */
|
|
|
|
|
if (found_key.objectid != objectid)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
/* we found an xattr, assume we've got an acl */
|
2013-06-19 10:16:26 -04:00
|
|
|
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
if (*first_xattr_slot == -1)
|
|
|
|
|
*first_xattr_slot = slot;
|
2013-06-19 10:16:26 -04:00
|
|
|
if (found_key.offset == xattr_access ||
|
|
|
|
|
found_key.offset == xattr_default)
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
2009-04-27 11:47:50 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* we found a key greater than an xattr key, there can't
|
|
|
|
|
* be any acls later on
|
|
|
|
|
*/
|
|
|
|
|
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
slot++;
|
|
|
|
|
scanned++;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* it goes inode, inode backrefs, xattrs, extents,
|
|
|
|
|
* so if there are a ton of hard links to an inode there can
|
|
|
|
|
* be a lot of backrefs. Don't waste time searching too hard,
|
|
|
|
|
* this is just an optimization
|
|
|
|
|
*/
|
|
|
|
|
if (scanned >= 8)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
/* we hit the end of the leaf before we found an xattr or
|
|
|
|
|
* something larger than an xattr. We have to assume the inode
|
|
|
|
|
* has acls
|
|
|
|
|
*/
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
if (*first_xattr_slot == -1)
|
|
|
|
|
*first_xattr_slot = slot;
|
2009-04-27 11:47:50 -04:00
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* read an inode from the btree into the in-memory inode
|
|
|
|
|
*/
|
Btrfs: fix deadlock on tree root leaf when finding free extent
When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:
schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]
At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.
So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.
Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-24 10:13:03 +01:00
|
|
|
static int btrfs_read_locked_inode(struct inode *inode,
|
|
|
|
|
struct btrfs_path *in_path)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
Btrfs: fix deadlock on tree root leaf when finding free extent
When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:
schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]
At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.
So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.
Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-24 10:13:03 +01:00
|
|
|
struct btrfs_path *path = in_path;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_key location;
|
2013-12-26 13:07:06 +08:00
|
|
|
unsigned long ptr;
|
2009-04-27 11:47:50 -04:00
|
|
|
int maybe_acls;
|
2007-07-11 10:18:17 -04:00
|
|
|
u32 rdev;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret;
|
2011-06-23 07:27:13 +00:00
|
|
|
bool filled = false;
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
int first_xattr_slot;
|
2011-06-23 07:27:13 +00:00
|
|
|
|
|
|
|
|
ret = btrfs_fill_inode(inode, &rdev);
|
|
|
|
|
if (!ret)
|
|
|
|
|
filled = true;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
Btrfs: fix deadlock on tree root leaf when finding free extent
When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:
schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]
At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.
So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.
Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-24 10:13:03 +01:00
|
|
|
if (!path) {
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
}
|
2011-07-12 11:25:31 -07:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
|
2008-01-08 15:46:30 -05:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
|
2016-06-06 11:51:25 +01:00
|
|
|
if (ret) {
|
Btrfs: fix deadlock on tree root leaf when finding free extent
When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:
schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]
At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.
So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.
Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-24 10:13:03 +01:00
|
|
|
if (path != in_path)
|
|
|
|
|
btrfs_free_path(path);
|
2018-07-29 23:04:51 +01:00
|
|
|
return ret;
|
2016-06-06 11:51:25 +01:00
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
leaf = path->nodes[0];
|
2011-06-23 07:27:13 +00:00
|
|
|
|
|
|
|
|
if (filled)
|
2013-12-26 13:07:06 +08:00
|
|
|
goto cache_index;
|
2011-06-23 07:27:13 +00:00
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
|
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
|
2011-10-28 14:13:29 +02:00
|
|
|
set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
|
2012-02-10 11:05:07 -08:00
|
|
|
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
|
|
|
|
|
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
|
2017-02-20 13:50:34 +02:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
|
2020-01-17 09:02:21 -05:00
|
|
|
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
|
|
|
|
|
round_up(i_size_read(inode), fs_info->sectorsize));
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2014-12-12 17:39:12 +01:00
|
|
|
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
|
|
|
|
|
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2014-12-12 17:39:12 +01:00
|
|
|
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
|
|
|
|
|
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2014-12-12 17:39:12 +01:00
|
|
|
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
|
|
|
|
|
inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2012-07-04 12:48:07 +05:30
|
|
|
BTRFS_I(inode)->i_otime.tv_sec =
|
|
|
|
|
btrfs_timespec_sec(leaf, &inode_item->otime);
|
|
|
|
|
BTRFS_I(inode)->i_otime.tv_nsec =
|
|
|
|
|
btrfs_timespec_nsec(leaf, &inode_item->otime);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2008-10-09 11:46:29 -04:00
|
|
|
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
|
2008-09-05 16:13:11 -04:00
|
|
|
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
|
|
|
|
|
|
2017-12-11 06:35:12 -05:00
|
|
|
inode_set_iversion_queried(inode,
|
|
|
|
|
btrfs_inode_sequence(leaf, inode_item));
|
2015-04-09 12:08:43 +08:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
|
|
|
|
inode->i_rdev = 0;
|
|
|
|
|
rdev = btrfs_inode_rdev(leaf, inode_item);
|
|
|
|
|
|
|
|
|
|
BTRFS_I(inode)->index_cnt = (u64)-1;
|
btrfs: add ro compat flags to inodes
Currently, inode flags are fully backwards incompatible in btrfs. If we
introduce a new inode flag, then tree-checker will detect it and fail.
This can even cause us to fail to mount entirely. To make it possible to
introduce new flags which can be read-only compatible, like VERITY, we
add new ro flags to btrfs without treating them quite so harshly in
tree-checker. A read-only file system can survive an unexpected flag,
and can be mounted.
As for the implementation, it unfortunately gets a little complicated.
The on-disk representation of the inode, btrfs_inode_item, has an __le64
for flags but the in-memory representation, btrfs_inode, uses a u32.
David Sterba had the nice idea that we could reclaim those wasted 32 bits
on disk and use them for the new ro_compat flags.
It turns out that the tree-checker code which checks for unknown flags
is broken, and ignores the upper 32 bits we are hoping to use. The issue
is that the flags use the literal 1 rather than 1ULL, so the flags are
signed ints, and one of them is specifically (1 << 31). As a result, the
mask which ORs the flags is a negative integer on machines where int is
32 bit twos complement. When tree-checker evaluates the expression:
btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)
The mask is something like 0x80000abc, which gets promoted to u64 with
sign extension to 0xffffffff80000abc. Negating that 64 bit mask leaves
all the upper bits zeroed, and we can't detect unexpected flags.
This suggests that we can't use those bits after all. Luckily, we have
good reason to believe that they are zero anyway. Inode flags are
metadata, which is always checksummed, so any bit flips that would
introduce 1s would cause a checksum failure anyway (excluding the
improbable case of the checksum getting corrupted exactly badly).
Further, unless the 1 << 31 flag is used, the cast to u64 of the 32 bit
inode flag should preserve its value and not add leading zeroes
(at least for twos complement). The only place that flag
(BTRFS_INODE_ROOT_ITEM_INIT) is used is in a special inode embedded in
the root item, and indeed for that inode we see 0xffffffff80000000 as
the flags on disk. However, that inode is never seen by tree checker,
nor is it used in a context where verity might be meaningful.
Theoretically, a future ro flag might cause trouble on that inode, so we
should proactively clean up that mess before it does.
With the introduction of the new ro flags, keep two separate unsigned
masks and check them against the appropriate u32. Since we no longer run
afoul of sign extension, this also stops writing out 0xffffffff80000000
in root_item inodes going forward.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:48 -07:00
|
|
|
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
|
|
|
|
|
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
|
2015-04-09 12:08:43 +08:00
|
|
|
|
|
|
|
|
cache_index:
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
/*
|
|
|
|
|
* If we were modified in the current generation and evicted from memory
|
|
|
|
|
* and then re-read we need to do a full sync since we don't have any
|
|
|
|
|
* idea about which extents were modified before we were evicted from
|
|
|
|
|
* cache.
|
2015-04-09 12:08:43 +08:00
|
|
|
*
|
|
|
|
|
* This is required for both inode re-read from disk and delayed inode
|
2022-07-15 13:59:45 +02:00
|
|
|
* in delayed_nodes_tree.
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
*/
|
2016-06-22 18:54:23 -04:00
|
|
|
if (BTRFS_I(inode)->last_trans == fs_info->generation)
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
|
|
Btrfs: fix stale dir entries after unlink, inode eviction and fsync
If we remove a hard link from an inode, the inode gets evicted, then
we fsync the inode and then power fail/crash, when the log tree is
replayed, the parent directory inode still has entries pointing to
the name that no longer exists, while our inode no longer has the
BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected),
leaving the filesystem in an inconsistent state. The stale directory
entries can not be deleted (an attempt to delete them causes -ESTALE
errors), which makes it impossible to delete the parent directory.
This happens because we track the id of the transaction where the last
unlink operation for the inode happened (last_unlink_trans) in an
in-memory only field of the inode, that is, a value that is never
persisted in the inode item stored on the fs/subvol btree. So if an
inode is evicted and loaded again, the value for last_unlink_trans is
set to 0, which prevents the fsync from logging the parent directory
at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans
to the id of the transaction that last modified the inode when we
load the inode. This is a pessimistic approach but it always ensures
correctness with the trade off of occasional full transaction commits
when an fsync is done against the inode in the same transaction where
it was evicted and reloaded when our inode is a directory and often
logging its parent unnecessarily when our inode is not a directory.
The following test case for fstests triggers the problem:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs generic
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file with 2 hard links.
mkdir $SCRATCH_MNT/testdir
touch $SCRATCH_MNT/testdir/foo
ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
# Make sure everything done so far is durably persisted.
sync
# Now remove one of the links, trigger inode eviction and then fsync
# our inode.
unlink $SCRATCH_MNT/testdir/bar
echo 2 > /proc/sys/vm/drop_caches
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo
# Silently drop all writes on our scratch device to simulate a power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again and mount the fs to trigger log/journal replay.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Now verify our directory entries.
echo "Entries in testdir:"
ls -1 $SCRATCH_MNT/testdir
# If we remove our inode, its parent should become empty and therefore we should
# be able to remove the parent.
rm -f $SCRATCH_MNT/testdir/*
rmdir $SCRATCH_MNT/testdir
_unmount_flakey
# The fstests framework will call fsck against our filesystem which will verify
# that all metadata is in a consistent state.
status=0
exit
The test failed on btrfs with:
generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad)
--- tests/generic/098.out 2015-07-23 18:01:12.616175932 +0100
+++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad 2015-07-23 18:04:58.924138308 +0100
@@ -1,3 +1,6 @@
QA output created by 098
Entries in testdir:
+bar
foo
+rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle
+rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
...
(Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad' to see the entire diff)
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full)
$ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full
(...)
checking fs roots
root 5 inode 258 errors 2001, no inode item, link count wrong
unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref
Checking filesystem on /dev/sdc
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-07-24 00:00:19 +01:00
|
|
|
/*
|
|
|
|
|
* We don't persist the id of the transaction where an unlink operation
|
|
|
|
|
* against the inode was last made. So here we assume the inode might
|
|
|
|
|
* have been evicted, and therefore the exact value of last_unlink_trans
|
|
|
|
|
* lost, and set it to last_trans to avoid metadata inconsistencies
|
|
|
|
|
* between the inode and its parent if the inode is fsync'ed and the log
|
|
|
|
|
* replayed. For example, in the scenario:
|
|
|
|
|
*
|
|
|
|
|
* touch mydir/foo
|
|
|
|
|
* ln mydir/foo mydir/bar
|
|
|
|
|
* sync
|
|
|
|
|
* unlink mydir/bar
|
|
|
|
|
* echo 2 > /proc/sys/vm/drop_caches # evicts inode
|
|
|
|
|
* xfs_io -c fsync mydir/foo
|
|
|
|
|
* <power failure>
|
|
|
|
|
* mount fs, triggers fsync log replay
|
|
|
|
|
*
|
|
|
|
|
* We must make sure that when we fsync our inode foo we also log its
|
|
|
|
|
* parent inode, otherwise after log replay the parent still has the
|
|
|
|
|
* dentry with the "bar" name but our inode foo has a link count of 1
|
|
|
|
|
* and doesn't have an inode ref with the name "bar" anymore.
|
|
|
|
|
*
|
|
|
|
|
* Setting last_unlink_trans to last_trans is a pessimistic approach,
|
2016-05-19 21:18:45 -04:00
|
|
|
* but it guarantees correctness at the expense of occasional full
|
Btrfs: fix stale dir entries after unlink, inode eviction and fsync
If we remove a hard link from an inode, the inode gets evicted, then
we fsync the inode and then power fail/crash, when the log tree is
replayed, the parent directory inode still has entries pointing to
the name that no longer exists, while our inode no longer has the
BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected),
leaving the filesystem in an inconsistent state. The stale directory
entries can not be deleted (an attempt to delete them causes -ESTALE
errors), which makes it impossible to delete the parent directory.
This happens because we track the id of the transaction where the last
unlink operation for the inode happened (last_unlink_trans) in an
in-memory only field of the inode, that is, a value that is never
persisted in the inode item stored on the fs/subvol btree. So if an
inode is evicted and loaded again, the value for last_unlink_trans is
set to 0, which prevents the fsync from logging the parent directory
at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans
to the id of the transaction that last modified the inode when we
load the inode. This is a pessimistic approach but it always ensures
correctness with the trade off of occasional full transaction commits
when an fsync is done against the inode in the same transaction where
it was evicted and reloaded when our inode is a directory and often
logging its parent unnecessarily when our inode is not a directory.
The following test case for fstests triggers the problem:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs generic
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file with 2 hard links.
mkdir $SCRATCH_MNT/testdir
touch $SCRATCH_MNT/testdir/foo
ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
# Make sure everything done so far is durably persisted.
sync
# Now remove one of the links, trigger inode eviction and then fsync
# our inode.
unlink $SCRATCH_MNT/testdir/bar
echo 2 > /proc/sys/vm/drop_caches
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo
# Silently drop all writes on our scratch device to simulate a power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again and mount the fs to trigger log/journal replay.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Now verify our directory entries.
echo "Entries in testdir:"
ls -1 $SCRATCH_MNT/testdir
# If we remove our inode, its parent should become empty and therefore we should
# be able to remove the parent.
rm -f $SCRATCH_MNT/testdir/*
rmdir $SCRATCH_MNT/testdir
_unmount_flakey
# The fstests framework will call fsck against our filesystem which will verify
# that all metadata is in a consistent state.
status=0
exit
The test failed on btrfs with:
generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad)
--- tests/generic/098.out 2015-07-23 18:01:12.616175932 +0100
+++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad 2015-07-23 18:04:58.924138308 +0100
@@ -1,3 +1,6 @@
QA output created by 098
Entries in testdir:
+bar
foo
+rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle
+rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
...
(Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad' to see the entire diff)
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full)
$ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full
(...)
checking fs roots
root 5 inode 258 errors 2001, no inode item, link count wrong
unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref
Checking filesystem on /dev/sdc
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-07-24 00:00:19 +01:00
|
|
|
* transaction commits on fsync if our inode is a directory, or if our
|
|
|
|
|
* inode is not a directory, logging its parent unnecessarily.
|
|
|
|
|
*/
|
|
|
|
|
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
|
|
|
|
|
|
btrfs: reduce contention on log trees when logging checksums
The possibility of extents being shared (through clone and deduplication
operations) requires special care when logging data checksums, to avoid
having a log tree with different checksum items that cover ranges which
overlap (which resulted in missing checksums after replaying a log tree).
Such problems were fixed in the past by the following commits:
commit 40e046acbd2f ("Btrfs: fix missing data checksums after replaying a
log tree")
commit e289f03ea79b ("btrfs: fix corrupt log due to concurrent fsync of
inodes with shared extents")
Test case generic/588 exercises the scenario solved by the first commit
(purely sequential and deterministic) while test case generic/457 often
triggered the case fixed by the second commit (not deterministic, requires
specific timings under concurrency).
The problems were addressed by deleting, from the log tree, any existing
checksums before logging the new ones. And also by doing the deletion and
logging of the checksums while locking the checksum range in an extent io
tree (root->log_csum_range), to deal with the case where we have concurrent
fsyncs against files with shared extents.
That however causes more contention on the leaves of a log tree where we
store checksums (and all the nodes in the paths leading to them), even
when we do not have shared extents, or all the shared extents were created
by past transactions. It also adds a bit of contention on the spin lock of
the log_csums_range extent io tree of the log root.
This change adds a 'last_reflink_trans' field to the inode to keep track
of the last transaction where a new extent was shared between inodes
(through clone and deduplication operations). It is updated for both the
source and destination inodes of reflink operations whenever a new extent
(created in the current transaction) becomes shared by the inodes. This
field is kept in memory only, not persisted in the inode item, similar
to other existing fields (last_unlink_trans, logged_trans).
When logging checksums for an extent, if the value of 'last_reflink_trans'
is smaller than the current transaction's generation/id, we skip locking
the extent range and deletion of checksums from the log tree, since we
know we do not have new shared extents. This reduces contention on the
log tree's leaves where checksums are stored.
The following script, which uses fio, was used to measure the impact of
this change:
$ cat test-fsync.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 3 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=write
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=64k
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The tests were performed for different numbers of jobs, file sizes and
fsync frequency. A qemu VM using kvm was used, with 8 cores (the host has
12 cores, with cpu governance set to performance mode on all cores), 16GiB
of ram (the host has 64GiB) and using a NVMe device directly (without an
intermediary filesystem in the host). While running the tests, the host
was not used for anything else, to avoid disturbing the tests.
The obtained results were the following (the last line of fio's output was
pasted). Starting with 16 jobs is where a significant difference is
observable in this particular setup and hardware (differences highlighted
below). The very small differences for tests with less than 16 jobs are
possibly just noise and random.
**** 1 job, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=23.8MiB/s (24.9MB/s), 23.8MiB/s-23.8MiB/s (24.9MB/s-24.9MB/s), io=1024MiB (1074MB), run=43075-43075msec
after this change:
WRITE: bw=24.4MiB/s (25.6MB/s), 24.4MiB/s-24.4MiB/s (25.6MB/s-25.6MB/s), io=1024MiB (1074MB), run=41938-41938msec
**** 2 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54351-54351msec
after this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.6MiB/s-37.6MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54428-54428msec
**** 4 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=67.5MiB/s (70.8MB/s), 67.5MiB/s-67.5MiB/s (70.8MB/s-70.8MB/s), io=4096MiB (4295MB), run=60669-60669msec
after this change:
WRITE: bw=68.6MiB/s (71.0MB/s), 68.6MiB/s-68.6MiB/s (71.0MB/s-71.0MB/s), io=4096MiB (4295MB), run=59678-59678msec
**** 8 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=128MiB/s (134MB/s), 128MiB/s-128MiB/s (134MB/s-134MB/s), io=8192MiB (8590MB), run=64048-64048msec
after this change:
WRITE: bw=129MiB/s (135MB/s), 129MiB/s-129MiB/s (135MB/s-135MB/s), io=8192MiB (8590MB), run=63405-63405msec
**** 16 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=78.5MiB/s (82.3MB/s), 78.5MiB/s-78.5MiB/s (82.3MB/s-82.3MB/s), io=16.0GiB (17.2GB), run=208676-208676msec
after this change:
WRITE: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s), io=16.0GiB (17.2GB), run=149295-149295msec
(+40.1% throughput, -28.5% runtime)
**** 32 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=58.8MiB/s (61.7MB/s), 58.8MiB/s-58.8MiB/s (61.7MB/s-61.7MB/s), io=32.0GiB (34.4GB), run=557134-557134msec
after this change:
WRITE: bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430550-430550msec
(+29.4% throughput, -22.7% runtime)
**** 64 jobs, file size 512M, fsync frequency 1 ****
before this change:
WRITE: bw=65.8MiB/s (68.0MB/s), 65.8MiB/s-65.8MiB/s (68.0MB/s-68.0MB/s), io=32.0GiB (34.4GB), run=498055-498055msec
after this change:
WRITE: bw=85.1MiB/s (89.2MB/s), 85.1MiB/s-85.1MiB/s (89.2MB/s-89.2MB/s), io=32.0GiB (34.4GB), run=385116-385116msec
(+29.3% throughput, -22.7% runtime)
**** 128 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=54.7MiB/s (57.3MB/s), 54.7MiB/s-54.7MiB/s (57.3MB/s-57.3MB/s), io=32.0GiB (34.4GB), run=599373-599373msec
after this change:
WRITE: bw=121MiB/s (126MB/s), 121MiB/s-121MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=271907-271907msec
(+121.2% throughput, -54.6% runtime)
**** 256 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=69.2MiB/s (72.5MB/s), 69.2MiB/s-69.2MiB/s (72.5MB/s-72.5MB/s), io=64.0GiB (68.7GB), run=947536-947536msec
after this change:
WRITE: bw=121MiB/s (127MB/s), 121MiB/s-121MiB/s (127MB/s-127MB/s), io=64.0GiB (68.7GB), run=541916-541916msec
(+74.9% throughput, -42.8% runtime)
**** 512 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=85.4MiB/s (89.5MB/s), 85.4MiB/s-85.4MiB/s (89.5MB/s-89.5MB/s), io=64.0GiB (68.7GB), run=767734-767734msec
after this change:
WRITE: bw=141MiB/s (147MB/s), 141MiB/s-141MiB/s (147MB/s-147MB/s), io=64.0GiB (68.7GB), run=466022-466022msec
(+65.1% throughput, -39.3% runtime)
**** 1024 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=115MiB/s (120MB/s), 115MiB/s-115MiB/s (120MB/s-120MB/s), io=128GiB (137GB), run=1143775-1143775msec
after this change:
WRITE: bw=171MiB/s (180MB/s), 171MiB/s-171MiB/s (180MB/s-180MB/s), io=128GiB (137GB), run=764843-764843msec
(+48.7% throughput, -33.1% runtime)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-15 12:30:43 +01:00
|
|
|
/*
|
|
|
|
|
* Same logic as for last_unlink_trans. We don't persist the generation
|
|
|
|
|
* of the last transaction where this inode was used for a reflink
|
|
|
|
|
* operation, so after eviction and reloading the inode we must be
|
|
|
|
|
* pessimistic and assume the last transaction that modified the inode.
|
|
|
|
|
*/
|
|
|
|
|
BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
|
|
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
path->slots[0]++;
|
|
|
|
|
if (inode->i_nlink != 1 ||
|
|
|
|
|
path->slots[0] >= btrfs_header_nritems(leaf))
|
|
|
|
|
goto cache_acl;
|
|
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
|
2017-01-10 20:35:31 +02:00
|
|
|
if (location.objectid != btrfs_ino(BTRFS_I(inode)))
|
2013-12-26 13:07:06 +08:00
|
|
|
goto cache_acl;
|
|
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
|
if (location.type == BTRFS_INODE_REF_KEY) {
|
|
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
|
|
|
|
|
|
ref = (struct btrfs_inode_ref *)ptr;
|
|
|
|
|
BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
|
|
|
|
|
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
|
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
|
|
|
|
|
|
extref = (struct btrfs_inode_extref *)ptr;
|
|
|
|
|
BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
|
|
|
|
|
extref);
|
|
|
|
|
}
|
2011-06-23 07:27:13 +00:00
|
|
|
cache_acl:
|
2009-04-27 11:47:50 -04:00
|
|
|
/*
|
|
|
|
|
* try to precache a NULL acl entry for files that don't have
|
|
|
|
|
* any xattrs or acls
|
|
|
|
|
*/
|
2011-04-20 10:31:50 +08:00
|
|
|
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
|
2017-01-20 14:54:07 +01:00
|
|
|
btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
if (first_xattr_slot != -1) {
|
|
|
|
|
path->slots[0] = first_xattr_slot;
|
|
|
|
|
ret = btrfs_load_inode_props(inode, path);
|
|
|
|
|
if (ret)
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_err(fs_info,
|
2014-05-15 16:48:20 +02:00
|
|
|
"error loading props for ino %llu (root %llu): %d",
|
2017-01-10 20:35:31 +02:00
|
|
|
btrfs_ino(BTRFS_I(inode)),
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
root->root_key.objectid, ret);
|
|
|
|
|
}
|
Btrfs: fix deadlock on tree root leaf when finding free extent
When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:
schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]
At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.
So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.
Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-24 10:13:03 +01:00
|
|
|
if (path != in_path)
|
|
|
|
|
btrfs_free_path(path);
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
|
2009-06-24 16:58:48 -04:00
|
|
|
if (!maybe_acls)
|
|
|
|
|
cache_no_acl(inode);
|
2009-04-27 11:47:50 -04:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
switch (inode->i_mode & S_IFMT) {
|
|
|
|
|
case S_IFREG:
|
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
|
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
|
|
|
|
break;
|
|
|
|
|
case S_IFDIR:
|
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
2017-01-25 17:06:38 -08:00
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
2007-06-12 06:35:45 -04:00
|
|
|
break;
|
|
|
|
|
case S_IFLNK:
|
|
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
2015-11-17 01:07:57 -05:00
|
|
|
inode_nohighmem(inode);
|
2018-09-24 15:16:55 -07:00
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
2007-06-12 06:35:45 -04:00
|
|
|
break;
|
2007-07-11 10:18:17 -04:00
|
|
|
default:
|
2009-02-04 09:29:13 -05:00
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
2007-07-11 10:18:17 -04:00
|
|
|
init_special_inode(inode, inode->i_mode, rdev);
|
|
|
|
|
break;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
2009-04-17 10:37:41 +02:00
|
|
|
|
2018-03-26 18:40:21 +02:00
|
|
|
btrfs_sync_inode_flags_to_i_flags(inode);
|
2016-06-06 11:51:25 +01:00
|
|
|
return 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* Given a leaf and an inode, copy the in-memory inode fields into the inode item stored in that leaf.
|
|
|
|
|
*/
|
2008-09-05 16:13:11 -04:00
|
|
|
static void fill_inode_item(struct btrfs_trans_handle *trans,
|
|
|
|
|
struct extent_buffer *leaf,
|
2007-10-15 16:14:19 -04:00
|
|
|
struct btrfs_inode_item *item,
|
2007-06-12 06:35:45 -04:00
|
|
|
struct inode *inode)
|
|
|
|
|
{
|
2012-12-27 09:01:21 +00:00
|
|
|
struct btrfs_map_token token;
|
btrfs: add ro compat flags to inodes
Currently, inode flags are fully backwards incompatible in btrfs. If we
introduce a new inode flag, then tree-checker will detect it and fail.
This can even cause us to fail to mount entirely. To make it possible to
introduce new flags which can be read-only compatible, like VERITY, we
add new ro flags to btrfs without treating them quite so harshly in
tree-checker. A read-only file system can survive an unexpected flag,
and can be mounted.
As for the implementation, it unfortunately gets a little complicated.
The on-disk representation of the inode, btrfs_inode_item, has an __le64
for flags but the in-memory representation, btrfs_inode, uses a u32.
David Sterba had the nice idea that we could reclaim those wasted 32 bits
on disk and use them for the new ro_compat flags.
It turns out that the tree-checker code which checks for unknown flags
is broken, and ignores the upper 32 bits we are hoping to use. The issue
is that the flags use the literal 1 rather than 1ULL, so the flags are
signed ints, and one of them is specifically (1 << 31). As a result, the
mask which ORs the flags is a negative integer on machines where int is
32 bit twos complement. When tree-checker evaluates the expression:
btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)
The mask is something like 0x80000abc, which gets promoted to u64 with
sign extension to 0xffffffff80000abc. Negating that 64 bit mask leaves
all the upper bits zeroed, and we can't detect unexpected flags.
This suggests that we can't use those bits after all. Luckily, we have
good reason to believe that they are zero anyway. Inode flags are
metadata, which is always checksummed, so any bit flips that would
introduce 1s would cause a checksum failure anyway (excluding the
improbable case of the checksum getting corrupted exactly badly).
Further, unless the 1 << 31 flag is used, the cast to u64 of the 32 bit
inode flag should preserve its value and not add leading zeroes
(at least for twos complement). The only place that flag
(BTRFS_INODE_ROOT_ITEM_INIT) is used is in a special inode embedded in
the root item, and indeed for that inode we see 0xffffffff80000000 as
the flags on disk. However, that inode is never seen by tree checker,
nor is it used in a context where verity might be meaningful.
Theoretically, a future ro flag might cause trouble on that inode, so we
should proactively clean up that mess before it does.
With the introduction of the new ro flags, keep two separate unsigned
masks and check them against the appropriate u32. Since we no longer run
afoul of sign extension, this also stops writing out 0xffffffff80000000
in root_item inodes going forward.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:48 -07:00
|
|
|
u64 flags;
|
2012-12-27 09:01:21 +00:00
|
|
|
|
2019-08-09 17:48:21 +02:00
|
|
|
btrfs_init_map_token(&token, leaf);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2020-04-29 02:15:56 +02:00
|
|
|
btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
|
|
|
|
|
btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
|
|
|
|
|
btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
|
|
|
|
|
btrfs_set_token_inode_mode(&token, item, inode->i_mode);
|
|
|
|
|
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
|
|
|
|
|
|
|
|
|
|
btrfs_set_token_timespec_sec(&token, &item->atime,
|
|
|
|
|
inode->i_atime.tv_sec);
|
|
|
|
|
btrfs_set_token_timespec_nsec(&token, &item->atime,
|
|
|
|
|
inode->i_atime.tv_nsec);
|
|
|
|
|
|
|
|
|
|
btrfs_set_token_timespec_sec(&token, &item->mtime,
|
|
|
|
|
inode->i_mtime.tv_sec);
|
|
|
|
|
btrfs_set_token_timespec_nsec(&token, &item->mtime,
|
|
|
|
|
inode->i_mtime.tv_nsec);
|
|
|
|
|
|
|
|
|
|
btrfs_set_token_timespec_sec(&token, &item->ctime,
|
|
|
|
|
inode->i_ctime.tv_sec);
|
|
|
|
|
btrfs_set_token_timespec_nsec(&token, &item->ctime,
|
|
|
|
|
inode->i_ctime.tv_nsec);
|
|
|
|
|
|
|
|
|
|
btrfs_set_token_timespec_sec(&token, &item->otime,
|
|
|
|
|
BTRFS_I(inode)->i_otime.tv_sec);
|
|
|
|
|
btrfs_set_token_timespec_nsec(&token, &item->otime,
|
|
|
|
|
BTRFS_I(inode)->i_otime.tv_nsec);
|
|
|
|
|
|
|
|
|
|
btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
|
|
|
|
|
btrfs_set_token_inode_generation(&token, item,
|
|
|
|
|
BTRFS_I(inode)->generation);
|
|
|
|
|
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
|
|
|
|
|
btrfs_set_token_inode_transid(&token, item, trans->transid);
|
|
|
|
|
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
|
btrfs: add ro compat flags to inodes
Currently, inode flags are fully backwards incompatible in btrfs. If we
introduce a new inode flag, then tree-checker will detect it and fail.
This can even cause us to fail to mount entirely. To make it possible to
introduce new flags which can be read-only compatible, like VERITY, we
add new ro flags to btrfs without treating them quite so harshly in
tree-checker. A read-only file system can survive an unexpected flag,
and can be mounted.
As for the implementation, it unfortunately gets a little complicated.
The on-disk representation of the inode, btrfs_inode_item, has an __le64
for flags but the in-memory representation, btrfs_inode, uses a u32.
David Sterba had the nice idea that we could reclaim those wasted 32 bits
on disk and use them for the new ro_compat flags.
It turns out that the tree-checker code which checks for unknown flags
is broken, and ignores the upper 32 bits we are hoping to use. The issue
is that the flags use the literal 1 rather than 1ULL, so the flags are
signed ints, and one of them is specifically (1 << 31). As a result, the
mask which ORs the flags is a negative integer on machines where int is
32 bit twos complement. When tree-checker evaluates the expression:
btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)
The mask is something like 0x80000abc, which gets promoted to u64 with
sign extension to 0xffffffff80000abc. Negating that 64 bit mask leaves
all the upper bits zeroed, and we can't detect unexpected flags.
This suggests that we can't use those bits after all. Luckily, we have
good reason to believe that they are zero anyway. Inode flags are
metadata, which is always checksummed, so any bit flips that would
introduce 1s would cause a checksum failure anyway (excluding the
improbable case of the checksum getting corrupted exactly badly).
Further, unless the 1 << 31 flag is used, the cast to u64 of the 32 bit
inode flag should preserve its value and not add leading zeroes
(at least for twos complement). The only place that flag
(BTRFS_INODE_ROOT_ITEM_INIT) is used is in a special inode embedded in
the root item, and indeed for that inode we see 0xffffffff80000000 as
the flags on disk. However, that inode is never seen by tree checker,
nor is it used in a context where verity might be meaningful.
Theoretically, a future ro flag might cause trouble on that inode, so we
should proactively clean up that mess before it does.
With the introduction of the new ro flags, keep two separate unsigned
masks and check them against the appropriate u32. Since we no longer run
afoul of sign extension, this also stops writing out 0xffffffff80000000
in root_item inodes going forward.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:48 -07:00
|
|
|
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
|
|
|
|
|
BTRFS_I(inode)->ro_flags);
|
|
|
|
|
btrfs_set_token_inode_flags(&token, item, flags);
|
2020-04-29 02:15:56 +02:00
|
|
|
btrfs_set_token_inode_block_group(&token, item, 0);
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
|
*/
|
2011-11-10 20:39:08 -05:00
|
|
|
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
|
2020-11-02 16:48:58 +02:00
|
|
|
struct btrfs_root *root,
|
|
|
|
|
struct btrfs_inode *inode)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
|
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
|
struct btrfs_path *path;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we find it there, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
2020-11-02 16:48:58 +02:00
|
|
|
ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (ret) {
|
|
|
|
|
if (ret > 0)
|
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
goto failed;
|
|
|
|
|
}
|
|
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items is beyond the upper bound, we create works for
all the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search for
it in the inserting rb-tree first. If we find it, just drop it. If not,
add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
struct btrfs_inode_item);
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2020-11-02 16:48:58 +02:00
|
|
|
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
|
2007-10-15 16:14:19 -04:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2020-11-02 16:48:58 +02:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
2007-06-12 06:35:45 -04:00
|
|
|
ret = 0;
|
|
|
|
|
failed:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-10 20:39:08 -05:00
|
|
|
/*
|
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
|
*/
|
|
|
|
|
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
|
2020-11-02 16:48:59 +02:00
|
|
|
struct btrfs_root *root,
|
|
|
|
|
struct btrfs_inode *inode)
|
2011-11-10 20:39:08 -05:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2011-11-10 20:39:08 -05:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If the inode is a free space inode, we can deadlock during commit
|
|
|
|
|
* if we put it into the delayed code.
|
|
|
|
|
*
|
|
|
|
|
* The data relocation inode should also be directly updated
|
|
|
|
|
* without delay
|
|
|
|
|
*/
|
2020-11-02 16:48:59 +02:00
|
|
|
if (!btrfs_is_free_space_inode(inode)
|
2021-09-09 01:19:25 +09:00
|
|
|
&& !btrfs_is_data_reloc_root(root)
|
2016-06-22 18:54:23 -04:00
|
|
|
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
|
2012-07-25 17:35:53 +02:00
|
|
|
btrfs_update_root_times(trans, root);
|
|
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_delayed_update_inode(trans, root, inode);
|
2011-11-10 20:39:08 -05:00
|
|
|
if (!ret)
|
2020-11-02 16:48:59 +02:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
2011-11-10 20:39:08 -05:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
return btrfs_update_inode_item(trans, root, inode);
|
2011-11-10 20:39:08 -05:00
|
|
|
}
|
|
|
|
|
|
2020-11-02 16:49:06 +02:00
|
|
|
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
|
|
|
|
|
struct btrfs_root *root, struct btrfs_inode *inode)
|
2011-11-10 20:39:08 -05:00
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
|
2020-11-02 16:49:06 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2011-11-10 20:39:08 -05:00
|
|
|
if (ret == -ENOSPC)
|
2020-11-02 16:49:06 +02:00
|
|
|
return btrfs_update_inode_item(trans, root, inode);
|
2011-11-10 20:39:08 -05:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* unlink helper that gets used here in inode.c and in the tree logging
|
|
|
|
|
* recovery code. It remove a link in a directory with a given name, and
|
|
|
|
|
* also drops the back refs in the inode to the directory
|
|
|
|
|
*/
|
2011-03-04 17:14:37 +00:00
|
|
|
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
2017-01-18 00:31:44 +02:00
|
|
|
struct btrfs_inode *dir,
|
|
|
|
|
struct btrfs_inode *inode,
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work then necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
const char *name, int name_len,
|
|
|
|
|
struct btrfs_rename_ctx *rename_ctx)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2021-10-25 17:31:50 +01:00
|
|
|
struct btrfs_root *root = dir->root;
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_path *path;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
struct btrfs_dir_item *di;
|
2008-07-24 12:12:38 -04:00
|
|
|
u64 index;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
|
u64 dir_ino = btrfs_ino(dir);
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2007-06-22 14:16:25 -04:00
|
|
|
if (!path) {
|
|
|
|
|
ret = -ENOMEM;
|
2011-02-03 03:16:25 +00:00
|
|
|
goto out;
|
2007-06-22 14:16:25 -04:00
|
|
|
}
|
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2007-06-12 06:35:45 -04:00
|
|
|
name, name_len, -1);
|
2018-09-12 06:06:26 +08:00
|
|
|
if (IS_ERR_OR_NULL(di)) {
|
|
|
|
|
ret = di ? PTR_ERR(di) : -ENOENT;
|
2007-06-12 06:35:45 -04:00
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
2007-06-22 14:16:25 -04:00
|
|
|
if (ret)
|
|
|
|
|
goto err;
|
2011-04-21 01:20:15 +02:00
|
|
|
btrfs_release_path(path);
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
/*
|
|
|
|
|
* If we don't have dir index, we have to get it by looking up
|
|
|
|
|
* the inode ref, since we get the inode ref, remove it directly,
|
|
|
|
|
* it is unnecessary to do delayed deletion.
|
|
|
|
|
*
|
|
|
|
|
* But if we have dir index, needn't search inode ref to get it.
|
|
|
|
|
* Since the inode ref is close to the inode item, it is better
|
|
|
|
|
* that we delay to delete it, and just do this deletion when
|
|
|
|
|
* we update the inode item.
|
|
|
|
|
*/
|
2017-01-18 00:31:44 +02:00
|
|
|
if (inode->dir_index) {
|
2013-12-26 13:07:06 +08:00
|
|
|
ret = btrfs_delayed_delete_inode_ref(inode);
|
|
|
|
|
if (!ret) {
|
2017-01-18 00:31:44 +02:00
|
|
|
index = inode->dir_index;
|
2013-12-26 13:07:06 +08:00
|
|
|
goto skip_backref;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
|
dir_ino, &index);
|
2008-07-24 12:12:38 -04:00
|
|
|
if (ret) {
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_info(fs_info,
|
2013-03-19 22:41:23 +00:00
|
|
|
"failed to delete reference to %.*s, inode %llu parent %llu",
|
2013-08-20 13:20:07 +02:00
|
|
|
name_len, name, ino, dir_ino);
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2008-07-24 12:12:38 -04:00
|
|
|
goto err;
|
|
|
|
|
}
|
2013-12-26 13:07:06 +08:00
|
|
|
skip_backref:
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work then necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
if (rename_ctx)
|
|
|
|
|
rename_ctx->index = index;
|
|
|
|
|
|
2018-08-01 11:32:26 +08:00
|
|
|
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2007-06-12 06:35:45 -04:00
|
|
|
goto err;
|
2012-03-12 16:03:00 +01:00
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
btrfs: stop doing unnecessary log updates during a rename
During a rename, we call __btrfs_unlink_inode(), which will call
btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(), in order
to remove an inode reference and a directory entry from the log. These
are necessary when __btrfs_unlink_inode() is called from the unlink path,
but not necessary when it's called from a rename context, because:
1) For the btrfs_del_inode_ref_in_log() call, it's pointless to delete the
inode reference related to the old name, because later in the rename
path we call btrfs_log_new_name(), which will drop all inode references
from the log and copy all inode references from the subvolume tree to
the log tree. So we are doing one unnecessary btree operation which
adds additional latency and lock contention in case there are other
tasks accessing the log tree;
2) For the btrfs_del_dir_entries_in_log() call, we are now doing the
equivalent at btrfs_log_new_name() since the previous patch in the
series, that has the subject "btrfs: avoid logging all directory
changes during renames". In fact, having __btrfs_unlink_inode() call
this function not only adds additional latency and lock contention due
to the extra btree operation, but also can make btrfs_log_new_name()
unnecessarily log a range item to track the deletion of the old name,
since it has no way to known that the directory entry related to the
old name was previously logged and already deleted by
__btrfs_unlink_inode() through its call to
btrfs_del_dir_entries_in_log().
So skip those calls at __btrfs_unlink_inode() when we are doing a rename.
Skipping them also allows us now to reduce the duration of time we are
pinning a log transaction during renames, which is always beneficial as
it's not delaying so much other tasks trying to sync the log tree, in
particular we end up not holding the log transaction pinned while adding
the new name (adding inode ref, directory entry, etc).
This change is part of a patchset comprised of the following patches:
1/5 btrfs: add helper to delete a dir entry from a log tree
2/5 btrfs: pass the dentry to btrfs_log_new_name() instead of the inode
3/5 btrfs: avoid logging all directory changes during renames
4/5 btrfs: stop doing unnecessary log updates during a rename
5/5 btrfs: avoid inode logging during rename and link when possible
Just like the previous patch in the series, "btrfs: avoid logging all
directory changes during renames", the following script mimics part of
what a package installation/upgrade with zypper does, which is basically
renaming a lot of files, in some directory under /usr, to a name with a
suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box a using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before patchset: 27399 ms
NUM_FILES=10000, after patches 1/5 to 3/5 applied: 9093 ms (-66.8%)
NUM_FILES=10000, after patches 1/5 to 4/5 applied: 9016 ms (-67.1%)
NUM_FILES=5000, before patchset: 9241 ms
NUM_FILES=5000, after patches 1/5 to 3/5 applied: 4642 ms (-49.8%)
NUM_FILES=5000, after patches 1/5 to 4/5 applied: 4553 ms (-50.7%)
NUM_FILES=2000, before patchset: 2550 ms
NUM_FILES=2000, after patches 1/5 to 3/5 applied: 1788 ms (-29.9%)
NUM_FILES=2000, after patches 1/5 to 4/5 applied: 1767 ms (-30.7%)
NUM_FILES=1000, before patchset: 1088 ms
NUM_FILES=1000, after patches 1/5 to 3/5 applied: 905 ms (-16.9%)
NUM_FILES=1000, after patches 1/5 to 4/5 applied: 883 ms (-18.8%)
The next patch in the series (5/5), also contains dbench results after
applying to whole patchset.
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:09 +00:00
|
|
|
/*
|
|
|
|
|
* If we are in a rename context, we don't need to update anything in the
|
|
|
|
|
* log. That will be done later during the rename by btrfs_log_new_name().
|
2022-05-25 16:27:25 +02:00
|
|
|
* Besides that, doing it here would only cause extra unnecessary btree
|
btrfs: stop doing unnecessary log updates during a rename
During a rename, we call __btrfs_unlink_inode(), which will call
btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(), in order
to remove an inode reference and a directory entry from the log. These
are necessary when __btrfs_unlink_inode() is called from the unlink path,
but not necessary when it's called from a rename context, because:
1) For the btrfs_del_inode_ref_in_log() call, it's pointless to delete the
inode reference related to the old name, because later in the rename
path we call btrfs_log_new_name(), which will drop all inode references
from the log and copy all inode references from the subvolume tree to
the log tree. So we are doing one unnecessary btree operation which
adds additional latency and lock contention in case there are other
tasks accessing the log tree;
2) For the btrfs_del_dir_entries_in_log() call, we are now doing the
equivalent at btrfs_log_new_name() since the previous patch in the
series, that has the subject "btrfs: avoid logging all directory
changes during renames". In fact, having __btrfs_unlink_inode() call
this function not only adds additional latency and lock contention due
to the extra btree operation, but also can make btrfs_log_new_name()
unnecessarily log a range item to track the deletion of the old name,
since it has no way to known that the directory entry related to the
old name was previously logged and already deleted by
__btrfs_unlink_inode() through its call to
btrfs_del_dir_entries_in_log().
So skip those calls at __btrfs_unlink_inode() when we are doing a rename.
Skipping them also allows us now to reduce the duration of time we are
pinning a log transaction during renames, which is always beneficial as
it's not delaying so much other tasks trying to sync the log tree, in
particular we end up not holding the log transaction pinned while adding
the new name (adding inode ref, directory entry, etc).
This change is part of a patchset comprised of the following patches:
1/5 btrfs: add helper to delete a dir entry from a log tree
2/5 btrfs: pass the dentry to btrfs_log_new_name() instead of the inode
3/5 btrfs: avoid logging all directory changes during renames
4/5 btrfs: stop doing unnecessary log updates during a rename
5/5 btrfs: avoid inode logging during rename and link when possible
Just like the previous patch in the series, "btrfs: avoid logging all
directory changes during renames", the following script mimics part of
what a package installation/upgrade with zypper does, which is basically
renaming a lot of files, in some directory under /usr, to a name with a
suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box a using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before patchset: 27399 ms
NUM_FILES=10000, after patches 1/5 to 3/5 applied: 9093 ms (-66.8%)
NUM_FILES=10000, after patches 1/5 to 4/5 applied: 9016 ms (-67.1%)
NUM_FILES=5000, before patchset: 9241 ms
NUM_FILES=5000, after patches 1/5 to 3/5 applied: 4642 ms (-49.8%)
NUM_FILES=5000, after patches 1/5 to 4/5 applied: 4553 ms (-50.7%)
NUM_FILES=2000, before patchset: 2550 ms
NUM_FILES=2000, after patches 1/5 to 3/5 applied: 1788 ms (-29.9%)
NUM_FILES=2000, after patches 1/5 to 4/5 applied: 1767 ms (-30.7%)
NUM_FILES=1000, before patchset: 1088 ms
NUM_FILES=1000, after patches 1/5 to 3/5 applied: 905 ms (-16.9%)
NUM_FILES=1000, after patches 1/5 to 4/5 applied: 883 ms (-18.8%)
The next patch in the series (5/5), also contains dbench results after
applying to whole patchset.
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:09 +00:00
|
|
|
* operations on the log tree, increasing latency for applications.
|
|
|
|
|
*/
|
|
|
|
|
if (!rename_ctx) {
|
|
|
|
|
btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
|
|
|
|
|
dir_ino);
|
|
|
|
|
btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
|
|
|
|
|
index);
|
|
|
|
|
}
|
2019-06-18 10:59:18 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we have a pending delayed iput we could end up with the final iput
|
|
|
|
|
* being run in btrfs-cleaner context. If we have enough of these built
|
|
|
|
|
* up we can end up burning a lot of time in btrfs-cleaner without any
|
|
|
|
|
* way to throttle the unlinks. Since we're currently holding a ref on
|
|
|
|
|
* the inode we can run the delayed iput here without any issues as the
|
|
|
|
|
* final iput won't be done until after we drop the ref we're currently
|
|
|
|
|
* holding.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_run_delayed_iput(fs_info, inode);
|
2007-06-12 06:35:45 -04:00
|
|
|
err:
|
|
|
|
|
btrfs_free_path(path);
|
2008-09-05 16:13:11 -04:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
|
2017-02-20 13:50:34 +02:00
|
|
|
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
|
2017-01-18 00:31:44 +02:00
|
|
|
inode_inc_iversion(&inode->vfs_inode);
|
|
|
|
|
inode_inc_iversion(&dir->vfs_inode);
|
2022-06-21 18:40:48 +02:00
|
|
|
inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
|
|
|
|
|
dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
|
|
|
|
|
dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, dir);
|
2008-09-05 16:13:11 -04:00
|
|
|
out:
|
2007-06-12 06:35:45 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2011-03-04 17:14:37 +00:00
|
|
|
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
2017-01-18 00:31:44 +02:00
|
|
|
struct btrfs_inode *dir, struct btrfs_inode *inode,
|
2011-03-04 17:14:37 +00:00
|
|
|
const char *name, int name_len)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work then necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
|
2011-03-04 17:14:37 +00:00
|
|
|
if (!ret) {
|
2017-01-18 00:31:44 +02:00
|
|
|
drop_nlink(&inode->vfs_inode);
|
2021-10-25 17:31:50 +01:00
|
|
|
ret = btrfs_update_inode(trans, inode->root, inode);
|
2011-03-04 17:14:37 +00:00
|
|
|
}
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2010-05-16 10:48:46 -04:00
|
|
|
/*
|
|
|
|
|
* helper to start transaction for unlink and rmdir.
|
|
|
|
|
*
|
2013-05-29 14:54:47 -04:00
|
|
|
* unlink and rmdir are special in btrfs, they do not always free space, so
|
|
|
|
|
* if we cannot make our reservations the normal way try and see if there is
|
|
|
|
|
* plenty of slack room in the global reserve to migrate, otherwise we cannot
|
|
|
|
|
* allow the unlink to occur.
|
2010-05-16 10:48:46 -04:00
|
|
|
*/
|
2013-05-29 14:54:47 -04:00
|
|
|
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
|
2009-09-21 15:56:00 -04:00
|
|
|
{
|
2010-05-16 10:48:46 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2009-09-21 15:56:00 -04:00
|
|
|
|
2011-10-11 14:18:24 -04:00
|
|
|
/*
|
|
|
|
|
* 1 for the possible orphan item
|
|
|
|
|
* 1 for the dir item
|
|
|
|
|
* 1 for the dir index
|
|
|
|
|
* 1 for the inode ref
|
|
|
|
|
* 1 for the inode
|
2022-03-09 17:31:31 -08:00
|
|
|
* 1 for the parent inode
|
2011-10-11 14:18:24 -04:00
|
|
|
*/
|
2022-03-09 17:31:31 -08:00
|
|
|
return btrfs_start_transaction_fallback_global_rsv(root, 6);
|
2010-05-16 10:48:46 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_trans_handle *trans;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2010-05-16 10:48:46 -04:00
|
|
|
int ret;
|
|
|
|
|
|
2013-05-29 14:54:47 -04:00
|
|
|
trans = __unlink_start_trans(dir);
|
2010-05-16 10:48:46 -04:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
|
return PTR_ERR(trans);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2017-01-18 00:31:44 +02:00
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
|
|
|
|
|
0);
|
2009-03-24 10:24:20 -04:00
|
|
|
|
2021-10-25 17:31:50 +01:00
|
|
|
ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
|
2017-01-18 00:31:44 +02:00
|
|
|
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
|
|
|
|
|
dentry->d_name.len);
|
2011-07-19 07:27:20 +00:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2010-05-16 10:48:46 -04:00
|
|
|
if (inode->i_nlink == 0) {
|
2017-02-20 13:50:59 +02:00
|
|
|
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
|
2011-07-19 07:27:20 +00:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
2010-05-16 10:48:46 -04:00
|
|
|
}
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2011-07-19 07:27:20 +00:00
|
|
|
out:
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2021-10-25 17:31:50 +01:00
|
|
|
btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
|
2007-06-12 06:35:45 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-18 11:34:52 +09:00
|
|
|
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
|
2019-12-18 17:20:27 -05:00
|
|
|
struct inode *dir, struct dentry *dentry)
|
2009-09-21 15:56:00 -04:00
|
|
|
{
|
2018-08-01 11:32:30 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2019-12-18 17:20:27 -05:00
|
|
|
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
|
2009-09-21 15:56:00 -04:00
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
|
struct btrfs_key key;
|
2019-12-18 17:20:27 -05:00
|
|
|
const char *name = dentry->d_name.name;
|
|
|
|
|
int name_len = dentry->d_name.len;
|
2009-09-21 15:56:00 -04:00
|
|
|
u64 index;
|
|
|
|
|
int ret;
|
2019-12-18 17:20:27 -05:00
|
|
|
u64 objectid;
|
2017-01-10 20:35:31 +02:00
|
|
|
u64 dir_ino = btrfs_ino(BTRFS_I(dir));
|
2009-09-21 15:56:00 -04:00
|
|
|
|
2019-12-18 17:20:27 -05:00
|
|
|
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
|
objectid = inode->root->root_key.objectid;
|
|
|
|
|
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
|
|
|
|
|
objectid = inode->location.objectid;
|
|
|
|
|
} else {
|
|
|
|
|
WARN_ON(1);
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2009-09-21 15:56:00 -04:00
|
|
|
name, name_len, -1);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (IS_ERR_OR_NULL(di)) {
|
2018-09-12 06:06:26 +08:00
|
|
|
ret = di ? PTR_ERR(di) : -ENOENT;
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out;
|
|
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &key);
|
|
|
|
|
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
|
|
|
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out;
|
|
|
|
|
}
|
2011-04-21 01:20:15 +02:00
|
|
|
btrfs_release_path(path);
|
2009-09-21 15:56:00 -04:00
|
|
|
|
2019-12-18 17:20:28 -05:00
|
|
|
/*
|
|
|
|
|
* This is a placeholder inode for a subvolume we didn't have a
|
|
|
|
|
* reference to at the time of the snapshot creation. In the meantime
|
|
|
|
|
* we could have renamed the real subvol link into our snapshot, so
|
2021-05-21 17:42:23 +02:00
|
|
|
* depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
|
2019-12-18 17:20:28 -05:00
|
|
|
* Instead simply lookup the dir_index_item for this entry so we can
|
|
|
|
|
* remove it. Otherwise we know we have a ref to the root and we can
|
|
|
|
|
* call btrfs_del_root_ref, and it _shouldn't_ fail.
|
|
|
|
|
*/
|
|
|
|
|
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_search_dir_index_item(root, path, dir_ino,
|
2009-09-21 15:56:00 -04:00
|
|
|
name, name_len);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (IS_ERR_OR_NULL(di)) {
|
|
|
|
|
if (!di)
|
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
else
|
|
|
|
|
ret = PTR_ERR(di);
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out;
|
|
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
|
|
index = key.offset;
|
2019-12-18 17:20:28 -05:00
|
|
|
btrfs_release_path(path);
|
|
|
|
|
} else {
|
|
|
|
|
ret = btrfs_del_root_ref(trans, objectid,
|
|
|
|
|
root->root_key.objectid, dir_ino,
|
|
|
|
|
&index, name, name_len);
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
}
|
|
|
|
|
|
2018-08-01 11:32:26 +08:00
|
|
|
ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out;
|
|
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
|
2017-02-20 13:50:34 +02:00
|
|
|
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
|
2012-04-05 15:03:02 -04:00
|
|
|
inode_inc_iversion(dir);
|
2022-06-21 18:40:48 +02:00
|
|
|
dir->i_mtime = current_time(dir);
|
|
|
|
|
dir->i_ctime = dir->i_mtime;
|
2020-11-02 16:49:06 +02:00
|
|
|
ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret)
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
out:
|
2011-06-14 14:24:32 -04:00
|
|
|
btrfs_free_path(path);
|
2012-03-12 16:03:00 +01:00
|
|
|
return ret;
|
2009-09-21 15:56:00 -04:00
|
|
|
}
|
|
|
|
|
|
2018-04-18 11:34:13 +09:00
|
|
|
/*
|
|
|
|
|
* Helper to check if the subvolume references other subvolumes or if it's
|
|
|
|
|
* default.
|
|
|
|
|
*/
|
2018-04-18 11:34:52 +09:00
|
|
|
static noinline int may_destroy_subvol(struct btrfs_root *root)
|
2018-04-18 11:34:13 +09:00
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
u64 dir_id;
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
|
|
/* Make sure this root isn't set as the default subvol */
|
|
|
|
|
dir_id = btrfs_super_root_dir(fs_info->super_copy);
|
|
|
|
|
di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
|
|
|
|
|
dir_id, "default", 7, 0);
|
|
|
|
|
if (di && !IS_ERR(di)) {
|
|
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
|
|
|
|
|
if (key.objectid == root->root_key.objectid) {
|
|
|
|
|
ret = -EPERM;
|
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
|
"deleting default subvolume %llu is not allowed",
|
|
|
|
|
key.objectid);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
key.objectid = root->root_key.objectid;
|
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
goto out;
|
|
|
|
|
BUG_ON(ret == 0);
|
|
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
if (path->slots[0] > 0) {
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
|
|
|
|
|
if (key.objectid == root->root_key.objectid &&
|
|
|
|
|
key.type == BTRFS_ROOT_REF_KEY)
|
|
|
|
|
ret = -ENOTEMPTY;
|
|
|
|
|
}
|
|
|
|
|
out:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-27 14:36:24 +03:00
|
|
|
/* Delete all dentries for inodes belonging to the root */
|
|
|
|
|
static void btrfs_prune_dentries(struct btrfs_root *root)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
struct rb_node *node;
|
|
|
|
|
struct rb_node *prev;
|
|
|
|
|
struct btrfs_inode *entry;
|
|
|
|
|
struct inode *inode;
|
|
|
|
|
u64 objectid = 0;
|
|
|
|
|
|
2021-10-05 16:35:25 -04:00
|
|
|
if (!BTRFS_FS_ERROR(fs_info))
|
2018-04-27 14:36:24 +03:00
|
|
|
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
|
|
|
|
|
|
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
|
again:
|
|
|
|
|
node = root->inode_tree.rb_node;
|
|
|
|
|
prev = NULL;
|
|
|
|
|
while (node) {
|
|
|
|
|
prev = node;
|
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
|
|
|
|
|
2018-06-29 10:56:40 +02:00
|
|
|
if (objectid < btrfs_ino(entry))
|
2018-04-27 14:36:24 +03:00
|
|
|
node = node->rb_left;
|
2018-06-29 10:56:40 +02:00
|
|
|
else if (objectid > btrfs_ino(entry))
|
2018-04-27 14:36:24 +03:00
|
|
|
node = node->rb_right;
|
|
|
|
|
else
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if (!node) {
|
|
|
|
|
while (prev) {
|
|
|
|
|
entry = rb_entry(prev, struct btrfs_inode, rb_node);
|
2018-06-29 10:56:40 +02:00
|
|
|
if (objectid <= btrfs_ino(entry)) {
|
2018-04-27 14:36:24 +03:00
|
|
|
node = prev;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
prev = rb_next(prev);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
while (node) {
|
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
2018-06-29 10:56:40 +02:00
|
|
|
objectid = btrfs_ino(entry) + 1;
|
2018-04-27 14:36:24 +03:00
|
|
|
inode = igrab(&entry->vfs_inode);
|
|
|
|
|
if (inode) {
|
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
|
if (atomic_read(&inode->i_count) > 1)
|
|
|
|
|
d_prune_aliases(inode);
|
|
|
|
|
/*
|
|
|
|
|
* btrfs_drop_inode will have it removed from the inode
|
|
|
|
|
* cache when its usage count hits zero.
|
|
|
|
|
*/
|
|
|
|
|
iput(inode);
|
|
|
|
|
cond_resched();
|
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
|
goto again;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (cond_resched_lock(&root->inode_lock))
|
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
|
|
node = rb_next(node);
|
|
|
|
|
}
|
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-18 11:34:52 +09:00
|
|
|
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
|
struct inode *inode = d_inode(dentry);
|
|
|
|
|
struct btrfs_root *dest = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
struct btrfs_block_rsv block_rsv;
|
|
|
|
|
u64 root_flags;
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Don't allow to delete a subvolume with send in progress. This is
|
|
|
|
|
* inside the inode lock so the error handling that has to drop the bit
|
|
|
|
|
* again is not run concurrently.
|
|
|
|
|
*/
|
|
|
|
|
spin_lock(&dest->root_item_lock);
|
2018-08-04 21:10:53 +08:00
|
|
|
if (dest->send_in_progress) {
|
2018-04-18 11:34:52 +09:00
|
|
|
spin_unlock(&dest->root_item_lock);
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"attempt to delete subvolume %llu during send",
|
|
|
|
|
dest->root_key.objectid);
|
|
|
|
|
return -EPERM;
|
|
|
|
|
}
|
2022-03-23 15:10:32 +08:00
|
|
|
if (atomic_read(&dest->nr_swapfiles)) {
|
|
|
|
|
spin_unlock(&dest->root_item_lock);
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"attempt to delete subvolume %llu with active swapfile",
|
|
|
|
|
root->root_key.objectid);
|
|
|
|
|
return -EPERM;
|
|
|
|
|
}
|
2018-08-04 21:10:53 +08:00
|
|
|
root_flags = btrfs_root_flags(&dest->root_item);
|
|
|
|
|
btrfs_set_root_flags(&dest->root_item,
|
|
|
|
|
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
|
|
|
|
|
spin_unlock(&dest->root_item_lock);
|
2018-04-18 11:34:52 +09:00
|
|
|
|
|
|
|
|
down_write(&fs_info->subvol_sem);
|
|
|
|
|
|
2020-11-24 17:49:30 +02:00
|
|
|
ret = may_destroy_subvol(dest);
|
|
|
|
|
if (ret)
|
2018-04-18 11:34:52 +09:00
|
|
|
goto out_up_write;
|
|
|
|
|
|
|
|
|
|
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
|
|
|
|
|
/*
|
|
|
|
|
* One for dir inode,
|
|
|
|
|
* two for dir entries,
|
|
|
|
|
* two for root ref/backref.
|
|
|
|
|
*/
|
2020-11-24 17:49:30 +02:00
|
|
|
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
|
|
|
|
|
if (ret)
|
2018-04-18 11:34:52 +09:00
|
|
|
goto out_up_write;
|
|
|
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
|
if (IS_ERR(trans)) {
|
2020-11-24 17:49:30 +02:00
|
|
|
ret = PTR_ERR(trans);
|
2018-04-18 11:34:52 +09:00
|
|
|
goto out_release;
|
|
|
|
|
}
|
|
|
|
|
trans->block_rsv = &block_rsv;
|
|
|
|
|
trans->bytes_reserved = block_rsv.size;
|
|
|
|
|
|
|
|
|
|
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
|
|
|
|
|
|
2019-12-18 17:20:27 -05:00
|
|
|
ret = btrfs_unlink_subvol(trans, dir, dentry);
|
2018-04-18 11:34:52 +09:00
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out_end_trans;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-12 15:25:04 -05:00
|
|
|
ret = btrfs_record_root_in_trans(trans, dest);
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out_end_trans;
|
|
|
|
|
}
|
2018-04-18 11:34:52 +09:00
|
|
|
|
|
|
|
|
memset(&dest->root_item.drop_progress, 0,
|
|
|
|
|
sizeof(dest->root_item.drop_progress));
|
2020-09-15 21:44:52 +02:00
|
|
|
btrfs_set_root_drop_level(&dest->root_item, 0);
|
2018-04-18 11:34:52 +09:00
|
|
|
btrfs_set_root_refs(&dest->root_item, 0);
|
|
|
|
|
|
|
|
|
|
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
|
|
|
|
|
ret = btrfs_insert_orphan_item(trans,
|
|
|
|
|
fs_info->tree_root,
|
|
|
|
|
dest->root_key.objectid);
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out_end_trans;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-29 15:01:54 +08:00
|
|
|
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
|
2018-04-18 11:34:52 +09:00
|
|
|
BTRFS_UUID_KEY_SUBVOL,
|
|
|
|
|
dest->root_key.objectid);
|
|
|
|
|
if (ret && ret != -ENOENT) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out_end_trans;
|
|
|
|
|
}
|
|
|
|
|
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
|
2018-05-29 15:01:54 +08:00
|
|
|
ret = btrfs_uuid_tree_remove(trans,
|
2018-04-18 11:34:52 +09:00
|
|
|
dest->root_item.received_uuid,
|
|
|
|
|
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
|
|
|
|
|
dest->root_key.objectid);
|
|
|
|
|
if (ret && ret != -ENOENT) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto out_end_trans;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-16 10:17:37 +08:00
|
|
|
free_anon_bdev(dest->anon_dev);
|
|
|
|
|
dest->anon_dev = 0;
|
2018-04-18 11:34:52 +09:00
|
|
|
out_end_trans:
|
|
|
|
|
trans->block_rsv = NULL;
|
|
|
|
|
trans->bytes_reserved = 0;
|
|
|
|
|
ret = btrfs_end_transaction(trans);
|
|
|
|
|
inode->i_flags |= S_DEAD;
|
|
|
|
|
out_release:
|
btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
[BUG]
When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
And with the following metadata leak:
BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
------------[ cut here ]------------
WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
Call Trace:
btrfs_put_super+0x15/0x17 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x17/0x30 [btrfs]
deactivate_locked_super+0x3b/0xa0
deactivate_super+0x40/0x50
cleanup_mnt+0x135/0x190
__cleanup_mnt+0x12/0x20
task_work_run+0x64/0xb0
__prepare_exit_to_usermode+0x1bc/0x1c0
__syscall_return_slowpath+0x47/0x230
do_syscall_64+0x64/0xb0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
---[ end trace a6cfd45ba80e4e06 ]---
BTRFS error (device dm-3): qgroup reserved space leaked
BTRFS info (device dm-3): disk space caching is enabled
BTRFS info (device dm-3): has skinny extents
[CAUSE]
The qgroup preallocated meta rsv operations of that offending root are:
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
It's pretty obvious that, we reserve qgroup meta rsv in
btrfs_subvolume_reserve_metadata(), but doesn't have corresponding
release/convert calls in btrfs_subvolume_release_metadata().
This leads to the leakage.
[FIX]
To fix this bug, we should follow what we're doing in
btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
add it to block_rsv->qgroup_rsv_reserved.
And free the qgroup reserved metadata space when releasing the
block_rsv.
To do this, we need to change the btrfs_subvolume_release_metadata() to
accept btrfs_root, and record the qgroup_to_release number, and call
btrfs_qgroup_convert_reserved_meta() for it.
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-24 14:46:10 +08:00
|
|
|
btrfs_subvolume_release_metadata(root, &block_rsv);
|
2018-04-18 11:34:52 +09:00
|
|
|
out_up_write:
|
|
|
|
|
up_write(&fs_info->subvol_sem);
|
2020-11-24 17:49:30 +02:00
|
|
|
if (ret) {
|
2018-04-18 11:34:52 +09:00
|
|
|
spin_lock(&dest->root_item_lock);
|
|
|
|
|
root_flags = btrfs_root_flags(&dest->root_item);
|
|
|
|
|
btrfs_set_root_flags(&dest->root_item,
|
|
|
|
|
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
|
|
|
|
|
spin_unlock(&dest->root_item_lock);
|
|
|
|
|
} else {
|
|
|
|
|
d_invalidate(dentry);
|
2018-04-27 14:36:24 +03:00
|
|
|
btrfs_prune_dentries(dest);
|
2018-04-18 11:34:52 +09:00
|
|
|
ASSERT(dest->send_in_progress == 0);
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-24 17:49:30 +02:00
|
|
|
return ret;
|
2018-04-18 11:34:52 +09:00
|
|
|
}
|
|
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
|
|
|
|
|
{
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2021-12-15 15:40:03 -05:00
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
2007-12-21 16:27:21 -05:00
|
|
|
int err = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
2016-06-06 16:11:13 +01:00
|
|
|
u64 last_unlink_trans;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2012-09-13 16:04:34 -06:00
|
|
|
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
2007-10-25 15:49:25 -04:00
|
|
|
return -ENOTEMPTY;
|
2021-12-15 15:40:03 -05:00
|
|
|
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
|
if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
|
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
|
"extent tree v2 doesn't support snapshot deletion yet");
|
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
}
|
2018-04-18 11:35:31 +09:00
|
|
|
return btrfs_delete_subvolume(dir, dentry);
|
2021-12-15 15:40:03 -05:00
|
|
|
}
|
2007-10-25 15:49:25 -04:00
|
|
|
|
2013-05-29 14:54:47 -04:00
|
|
|
trans = __unlink_start_trans(dir);
|
2010-05-16 10:48:46 -04:00
|
|
|
if (IS_ERR(trans))
|
2009-11-10 21:23:48 -05:00
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
|
2017-01-10 20:35:31 +02:00
|
|
|
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
2019-12-18 17:20:27 -05:00
|
|
|
err = btrfs_unlink_subvol(trans, dir, dentry);
|
2009-09-21 15:56:00 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-20 13:50:59 +02:00
|
|
|
err = btrfs_orphan_add(trans, BTRFS_I(inode));
|
2008-07-24 12:17:14 -04:00
|
|
|
if (err)
|
2009-09-21 15:56:00 -04:00
|
|
|
goto out;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2016-06-06 16:11:13 +01:00
|
|
|
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
|
|
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
/* now the directory is empty */
|
2021-10-25 17:31:50 +01:00
|
|
|
err = btrfs_unlink_inode(trans, BTRFS_I(dir),
|
2017-01-18 00:31:44 +02:00
|
|
|
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
|
|
|
|
|
dentry->d_name.len);
|
2016-06-06 16:11:13 +01:00
|
|
|
if (!err) {
|
2017-02-20 13:50:34 +02:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), 0);
|
2016-06-06 16:11:13 +01:00
|
|
|
/*
|
|
|
|
|
* Propagate the last_unlink_trans value of the deleted dir to
|
|
|
|
|
* its parent directory. This is to prevent an unrecoverable
|
|
|
|
|
* log tree in the case we do something like this:
|
|
|
|
|
* 1) create dir foo
|
|
|
|
|
* 2) create snapshot under dir foo
|
|
|
|
|
* 3) delete the snapshot
|
|
|
|
|
* 4) rmdir foo
|
|
|
|
|
* 5) mkdir foo
|
|
|
|
|
* 6) fsync foo or some file inside foo
|
|
|
|
|
*/
|
|
|
|
|
if (last_unlink_trans >= trans->transid)
|
|
|
|
|
BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
|
|
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
out:
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2021-12-15 15:40:03 -05:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2007-12-12 14:38:19 -05:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2016-01-21 15:55:56 +05:30
|
|
|
* btrfs_truncate_block - read, zero a chunk and write a block
|
2012-08-29 14:27:18 -04:00
|
|
|
* @inode - inode that we're zeroing
|
|
|
|
|
* @from - the offset to start zeroing
|
|
|
|
|
* @len - the length to zero, 0 to zero the entire range respective to the
|
|
|
|
|
* offset
|
|
|
|
|
* @front - zero up to the offset instead of from the offset on
|
|
|
|
|
*
|
2016-01-21 15:55:56 +05:30
|
|
|
* This will find the block for the "from" offset and cow the block and zero the
|
2012-08-29 14:27:18 -04:00
|
|
|
* part we want to zero. This is used with truncate and hole punching.
|
2007-06-12 06:35:45 -04:00
|
|
|
*/
|
2020-11-02 16:49:03 +02:00
|
|
|
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
|
|
|
|
|
int front)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2020-11-02 16:49:03 +02:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
|
|
|
struct address_space *mapping = inode->vfs_inode.i_mapping;
|
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
2008-07-17 12:53:50 -04:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2017-02-27 15:10:38 +08:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2020-06-24 07:23:50 +08:00
|
|
|
bool only_release_metadata = false;
|
2016-06-22 18:54:23 -04:00
|
|
|
u32 blocksize = fs_info->sectorsize;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
pgoff_t index = from >> PAGE_SHIFT;
|
2016-01-21 15:55:56 +05:30
|
|
|
unsigned offset = from & (blocksize - 1);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct page *page;
|
2011-09-21 15:05:58 -04:00
|
|
|
gfp_t mask = btrfs_alloc_write_mask(mapping);
|
2020-06-24 07:23:50 +08:00
|
|
|
size_t write_bytes = blocksize;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret = 0;
|
2016-01-21 15:55:56 +05:30
|
|
|
u64 block_start;
|
|
|
|
|
u64 block_end;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2018-01-18 14:47:06 +02:00
|
|
|
if (IS_ALIGNED(offset, blocksize) &&
|
|
|
|
|
(!len || IS_ALIGNED(len, blocksize)))
|
2007-06-12 06:35:45 -04:00
|
|
|
goto out;
|
2016-01-21 15:55:56 +05:30
|
|
|
|
2017-10-19 14:15:55 -04:00
|
|
|
block_start = round_down(from, blocksize);
|
|
|
|
|
block_end = block_start + blocksize - 1;
|
|
|
|
|
|
2020-11-02 16:49:03 +02:00
|
|
|
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
|
|
|
|
|
blocksize);
|
2020-06-24 07:23:50 +08:00
|
|
|
if (ret < 0) {
|
2020-11-02 16:49:03 +02:00
|
|
|
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
|
2020-06-24 07:23:50 +08:00
|
|
|
/* For nocow case, no need to reserve data space */
|
|
|
|
|
only_release_metadata = true;
|
|
|
|
|
} else {
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path try to reserve all the metadata it needs
without blocking - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fall back to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run on a 12-core box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
|
|
|
ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
|
2020-06-24 07:23:50 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
|
if (!only_release_metadata)
|
2020-11-02 16:49:03 +02:00
|
|
|
btrfs_free_reserved_data_space(inode, data_reserved,
|
|
|
|
|
block_start, blocksize);
|
2020-06-24 07:23:50 +08:00
|
|
|
goto out;
|
|
|
|
|
}
|
2008-05-15 09:13:45 -04:00
|
|
|
again:
|
2011-09-21 15:05:58 -04:00
|
|
|
page = find_or_create_page(mapping, index, mask);
|
2009-10-13 16:46:49 -04:00
|
|
|
if (!page) {
|
2020-11-02 16:49:03 +02:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved, block_start,
|
|
|
|
|
blocksize, true);
|
|
|
|
|
btrfs_delalloc_release_extents(inode, blocksize);
|
2012-12-05 10:56:13 +00:00
|
|
|
ret = -ENOMEM;
|
2007-06-12 06:35:45 -04:00
|
|
|
goto out;
|
2009-10-13 16:46:49 -04:00
|
|
|
}
|
2021-01-26 16:34:00 +08:00
|
|
|
ret = set_page_extent_mapped(page);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
goto out_unlock;
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
if (!PageUptodate(page)) {
|
2022-04-29 11:12:16 -04:00
|
|
|
ret = btrfs_read_folio(NULL, page_folio(page));
|
2007-06-12 06:35:45 -04:00
|
|
|
lock_page(page);
|
2008-05-15 09:13:45 -04:00
|
|
|
if (page->mapping != mapping) {
|
|
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
2008-05-15 09:13:45 -04:00
|
|
|
goto again;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
|
ret = -EIO;
|
2008-07-24 09:41:53 -04:00
|
|
|
goto out_unlock;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
}
|
2008-05-15 09:13:45 -04:00
|
|
|
wait_on_page_writeback(page);
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2016-01-21 15:55:56 +05:30
|
|
|
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2020-11-02 16:49:03 +02:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, block_start);
|
2008-07-17 12:53:50 -04:00
|
|
|
if (ordered) {
|
2016-01-21 15:55:56 +05:30
|
|
|
unlock_extent_cached(io_tree, block_start, block_end,
|
2017-12-12 21:43:52 +01:00
|
|
|
&cached_state);
|
2008-07-17 12:53:50 -04:00
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
2020-09-18 12:15:53 +03:00
|
|
|
btrfs_start_ordered_extent(ordered, 1);
|
2008-07-17 12:53:50 -04:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
goto again;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-02 16:49:03 +02:00
|
|
|
clear_extent_bit(&inode->io_tree, block_start, block_end,
|
2019-08-15 14:04:04 -07:00
|
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
|
|
|
|
|
0, 0, &cached_state);
|
2009-10-13 16:46:49 -04:00
|
|
|
|
2020-11-02 16:49:03 +02:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
|
2019-07-17 16:18:17 +03:00
|
|
|
&cached_state);
|
2009-09-11 16:12:44 -04:00
|
|
|
if (ret) {
|
2016-01-21 15:55:56 +05:30
|
|
|
unlock_extent_cached(io_tree, block_start, block_end,
|
2017-12-12 21:43:52 +01:00
|
|
|
&cached_state);
|
2009-09-11 16:12:44 -04:00
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-21 15:55:56 +05:30
|
|
|
if (offset != blocksize) {
|
2012-08-29 14:27:18 -04:00
|
|
|
if (!len)
|
2016-01-21 15:55:56 +05:30
|
|
|
len = blocksize - offset;
|
2012-08-29 14:27:18 -04:00
|
|
|
if (front)
|
btrfs: use memzero_page() instead of open coded kmap pattern
There are many places where kmap/memset/kunmap patterns occur.
Use the newly lifted memzero_page() to eliminate direct uses of kmap and
leverage the new core function's use of kmap_local_page().
The development of this patch was aided by the following coccinelle
script:
// <smpl>
// SPDX-License-Identifier: GPL-2.0-only
// Find kmap/memset/kunmap pattern and replace with memset*page calls
//
// NOTE: Offsets and other expressions may be more complex than what the script
// will automatically generate. Therefore a catchall rule is provided to find
// the pattern which then must be evaluated by hand.
//
// Confidence: Low
// Copyright: (C) 2021 Intel Corporation
// URL: http://coccinelle.lip6.fr/
// Comments:
// Options:
//
// Then the memset pattern
//
@ memset_rule1 @
expression page, V, L, Off;
identifier ptr;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
-memset(ptr, 0, L);
+memzero_page(page, 0, L);
|
-memset(ptr + Off, 0, L);
+memzero_page(page, Off, L);
|
-memset(ptr, V, L);
+memset_page(page, V, 0, L);
|
-memset(ptr + Off, V, L);
+memset_page(page, V, Off, L);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule1
@
identifier memset_rule1.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
//
// Catch all
//
@ memset_rule2 @
expression page;
identifier ptr;
expression GenTo, GenSize, GenValue;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
//
// Some call sites have complex expressions within the memset/memcpy
// The follow are catch alls which need to be evaluated by hand.
//
-memset(GenTo, 0, GenSize);
+memzero_pageExtra(page, GenTo, GenSize);
|
-memset(GenTo, GenValue, GenSize);
+memset_pageExtra(page, GenValue, GenTo, GenSize);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule2
@
identifier memset_rule2.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
// </smpl>
Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-04 18:40:07 -07:00
|
|
|
memzero_page(page, (block_start - page_offset(page)),
|
|
|
|
|
offset);
|
2012-08-29 14:27:18 -04:00
|
|
|
else
|
btrfs: use memzero_page() instead of open coded kmap pattern
There are many places where kmap/memset/kunmap patterns occur.
Use the newly lifted memzero_page() to eliminate direct uses of kmap and
leverage the new core function's use of kmap_local_page().
The development of this patch was aided by the following coccinelle
script:
// <smpl>
// SPDX-License-Identifier: GPL-2.0-only
// Find kmap/memset/kunmap pattern and replace with memset*page calls
//
// NOTE: Offsets and other expressions may be more complex than what the script
// will automatically generate. Therefore a catchall rule is provided to find
// the pattern which then must be evaluated by hand.
//
// Confidence: Low
// Copyright: (C) 2021 Intel Corporation
// URL: http://coccinelle.lip6.fr/
// Comments:
// Options:
//
// Then the memset pattern
//
@ memset_rule1 @
expression page, V, L, Off;
identifier ptr;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
-memset(ptr, 0, L);
+memzero_page(page, 0, L);
|
-memset(ptr + Off, 0, L);
+memzero_page(page, Off, L);
|
-memset(ptr, V, L);
+memset_page(page, V, 0, L);
|
-memset(ptr + Off, V, L);
+memset_page(page, V, Off, L);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule1
@
identifier memset_rule1.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
//
// Catch all
//
@ memset_rule2 @
expression page;
identifier ptr;
expression GenTo, GenSize, GenValue;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
//
// Some call sites have complex expressions within the memset/memcpy
// The follow are catch alls which need to be evaluated by hand.
//
-memset(GenTo, 0, GenSize);
+memzero_pageExtra(page, GenTo, GenSize);
|
-memset(GenTo, GenValue, GenSize);
+memset_pageExtra(page, GenValue, GenTo, GenSize);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule2
@
identifier memset_rule2.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
// </smpl>
Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-04 18:40:07 -07:00
|
|
|
memzero_page(page, (block_start - page_offset(page)) + offset,
|
|
|
|
|
len);
|
2008-07-17 12:53:50 -04:00
|
|
|
}
|
2021-09-27 15:21:49 +08:00
|
|
|
btrfs_page_clear_checked(fs_info, page, block_start,
|
|
|
|
|
block_end + 1 - block_start);
|
2021-05-31 16:50:51 +08:00
|
|
|
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
|
2017-12-12 21:43:52 +01:00
|
|
|
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2020-06-24 07:23:50 +08:00
|
|
|
if (only_release_metadata)
|
2020-11-02 16:49:03 +02:00
|
|
|
set_extent_bit(&inode->io_tree, block_start, block_end,
|
2020-11-05 11:08:00 +02:00
|
|
|
EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
|
2020-06-24 07:23:50 +08:00
|
|
|
|
2008-07-24 09:41:53 -04:00
|
|
|
out_unlock:
|
2020-06-24 07:23:50 +08:00
|
|
|
if (ret) {
|
|
|
|
|
if (only_release_metadata)
|
2020-11-02 16:49:03 +02:00
|
|
|
btrfs_delalloc_release_metadata(inode, blocksize, true);
|
2020-06-24 07:23:50 +08:00
|
|
|
else
|
2020-11-02 16:49:03 +02:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved,
|
2020-06-24 07:23:50 +08:00
|
|
|
block_start, blocksize, true);
|
|
|
|
|
}
|
2020-11-02 16:49:03 +02:00
|
|
|
btrfs_delalloc_release_extents(inode, blocksize);
|
2007-06-12 06:35:45 -04:00
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
2007-06-12 06:35:45 -04:00
|
|
|
out:
|
2020-06-24 07:23:50 +08:00
|
|
|
if (only_release_metadata)
|
2020-11-02 16:49:03 +02:00
|
|
|
btrfs_check_nocow_unlock(inode);
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_free(data_reserved);
|
2007-06-12 06:35:45 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-02 16:49:00 +02:00
|
|
|
static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
|
2013-10-22 12:18:51 -04:00
|
|
|
u64 offset, u64 len)
|
|
|
|
|
{
|
2020-11-02 16:49:00 +02:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2013-10-22 12:18:51 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
2020-11-04 11:07:32 +00:00
|
|
|
struct btrfs_drop_extents_args drop_args = { 0 };
|
2013-10-22 12:18:51 -04:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
/*
|
btrfs: remove racy and unnecessary inode transaction update when using no-holes
When using the NO_HOLES feature and expanding the size of an inode, we
update the inode's last_trans, last_sub_trans and last_log_commit fields
at maybe_insert_hole() so that a fsync does know that the inode needs to
be logged (by making sure that btrfs_inode_in_log() returns false). This
happens for expanding truncate operations, buffered writes, direct IO
writes and when cloning extents to an offset greater than the inode's
i_size.
However the way we do it is racy, because in between setting the inode's
last_sub_trans and last_log_commit fields, the log transaction ID that was
assigned to last_sub_trans might be committed before we read the root's
last_log_commit and assign that value to last_log_commit. If that happens
it would make a future call to btrfs_inode_in_log() return true. This is
a race that should be extremely unlikely to be hit in practice, and it is
the same that was described by commit bc0939fcfab0d7 ("btrfs: fix race
between marking inode needs to be logged and log syncing").
The fix would simply be to set last_log_commit to the value we assigned
to last_sub_trans minus 1, like it was done in that commit. However
updating these two fields plus the last_trans field is pointless here
because all the callers of btrfs_cont_expand() (which is the only
caller of maybe_insert_hole()) always call btrfs_set_inode_last_trans()
or btrfs_update_inode() after calling btrfs_cont_expand(). Calling either
btrfs_set_inode_last_trans() or btrfs_update_inode() guarantees that the
next fsync will log the inode, as it makes btrfs_inode_in_log() return
false.
So just remove the code that explicitly sets the inode's last_trans,
last_sub_trans and last_log_commit fields.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-20 16:03:40 +01:00
|
|
|
* If NO_HOLES is enabled, we don't need to do anything.
|
|
|
|
|
* Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
|
|
|
|
|
* or btrfs_update_inode() will be called, which guarantee that the next
|
|
|
|
|
* fsync will know this inode was changed and needs to be logged.
|
2013-10-22 12:18:51 -04:00
|
|
|
*/
|
btrfs: remove racy and unnecessary inode transaction update when using no-holes
When using the NO_HOLES feature and expanding the size of an inode, we
update the inode's last_trans, last_sub_trans and last_log_commit fields
at maybe_insert_hole() so that a fsync does know that the inode needs to
be logged (by making sure that btrfs_inode_in_log() returns false). This
happens for expanding truncate operations, buffered writes, direct IO
writes and when cloning extents to an offset greater than the inode's
i_size.
However the way we do it is racy, because in between setting the inode's
last_sub_trans and last_log_commit fields, the log transaction ID that was
assigned to last_sub_trans might be committed before we read the root's
last_log_commit and assign that value to last_log_commit. If that happens
it would make a future call to btrfs_inode_in_log() return true. This is
a race that should be extremely unlikely to be hit in practice, and it is
the same that was described by commit bc0939fcfab0d7 ("btrfs: fix race
between marking inode needs to be logged and log syncing").
The fix would simply be to set last_log_commit to the value we assigned
to last_sub_trans minus 1, like it was done in that commit. However
updating these two fields plus the last_trans field is pointless here
because all the callers of btrfs_cont_expand() (which is the only
caller of maybe_insert_hole()) always call btrfs_set_inode_last_trans()
or btrfs_update_inode() after calling btrfs_cont_expand(). Calling either
btrfs_set_inode_last_trans() or btrfs_update_inode() guarantees that the
next fsync will log the inode, as it makes btrfs_inode_in_log() return
false.
So just remove the code that explicitly sets the inode's last_trans,
last_sub_trans and last_log_commit fields.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-07-20 16:03:40 +01:00
|
|
|
if (btrfs_fs_incompat(fs_info, NO_HOLES))
|
2013-10-22 12:18:51 -04:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* 1 - for the one we're dropping
|
|
|
|
|
* 1 - for the one we're adding
|
|
|
|
|
* 1 - for updating the inode.
|
|
|
|
|
*/
|
|
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
|
2020-11-04 11:07:32 +00:00
|
|
|
drop_args.start = offset;
|
|
|
|
|
drop_args.end = offset + len;
|
|
|
|
|
drop_args.drop_cache = true;
|
|
|
|
|
|
2020-11-02 16:49:00 +02:00
|
|
|
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
|
2013-10-22 12:18:51 -04:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2013-10-22 12:18:51 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2022-07-23 18:25:29 -04:00
|
|
|
ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, causing a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be and does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
} else {
|
2020-11-02 16:49:00 +02:00
|
|
|
btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
|
|
|
|
|
btrfs_update_inode(trans, root, inode);
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
}
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2013-10-22 12:18:51 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2011-03-04 15:46:53 -05:00
|
|
|
/*
|
|
|
|
|
* This function puts in dummy file extents for the area we're creating a hole
|
|
|
|
|
* for. So if we are truncating this file to a larger size we need to insert
|
|
|
|
|
* these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
|
|
|
|
|
* the range between oldsize and size
|
|
|
|
|
*/
|
2020-11-02 16:49:04 +02:00
|
|
|
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2020-11-02 16:49:04 +02:00
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
2010-05-16 10:48:46 -04:00
|
|
|
struct extent_map *em = NULL;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2020-11-02 16:49:04 +02:00
|
|
|
struct extent_map_tree *em_tree = &inode->extent_tree;
|
2016-06-22 18:54:23 -04:00
|
|
|
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
|
|
|
|
|
u64 block_end = ALIGN(size, fs_info->sectorsize);
|
2008-10-30 14:19:41 -04:00
|
|
|
u64 last_byte;
|
|
|
|
|
u64 cur_offset;
|
|
|
|
|
u64 hole_size;
|
2009-09-11 16:12:44 -04:00
|
|
|
int err = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2013-06-17 17:14:39 -04:00
|
|
|
/*
|
2016-01-21 15:55:56 +05:30
|
|
|
* If our size started in the middle of a block we need to zero out the
|
|
|
|
|
* rest of the block before we expand the i_size, otherwise we could
|
2013-06-17 17:14:39 -04:00
|
|
|
* expose stale data.
|
|
|
|
|
*/
|
2020-11-02 16:49:04 +02:00
|
|
|
err = btrfs_truncate_block(inode, oldsize, 0, 0);
|
2013-06-17 17:14:39 -04:00
|
|
|
if (err)
|
|
|
|
|
return err;
|
|
|
|
|
|
2008-10-30 14:19:41 -04:00
|
|
|
if (size <= hole_start)
|
|
|
|
|
return 0;
|
|
|
|
|
|
2020-11-02 16:49:04 +02:00
|
|
|
btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
|
|
|
|
|
&cached_state);
|
2008-10-30 14:19:41 -04:00
|
|
|
cur_offset = hole_start;
|
|
|
|
|
while (1) {
|
2020-11-02 16:49:04 +02:00
|
|
|
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
|
2019-12-02 17:34:23 -08:00
|
|
|
block_end - cur_offset);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
err = PTR_ERR(em);
|
2013-01-08 19:37:58 +00:00
|
|
|
em = NULL;
|
2012-03-12 16:03:00 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2008-10-30 14:19:41 -04:00
|
|
|
last_byte = min(extent_map_end(em), block_end);
|
2016-06-22 18:54:23 -04:00
|
|
|
last_byte = ALIGN(last_byte, fs_info->sectorsize);
|
2020-01-17 09:02:22 -05:00
|
|
|
hole_size = last_byte - cur_offset;
|
|
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
struct extent_map *hole_em;
|
2009-09-11 16:12:44 -04:00
|
|
|
|
2020-11-02 16:49:04 +02:00
|
|
|
err = maybe_insert_hole(root, inode, cur_offset,
|
|
|
|
|
hole_size);
|
2013-10-22 12:18:51 -04:00
|
|
|
if (err)
|
2011-01-31 16:03:11 -05:00
|
|
|
break;
|
2020-01-17 09:02:22 -05:00
|
|
|
|
2020-11-02 16:49:04 +02:00
|
|
|
err = btrfs_inode_set_file_extent_range(inode,
|
2020-01-17 09:02:22 -05:00
|
|
|
cur_offset, hole_size);
|
|
|
|
|
if (err)
|
|
|
|
|
break;
|
|
|
|
|
|
2020-11-02 16:49:04 +02:00
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
cur_offset + hole_size - 1, 0);
|
|
|
|
|
hole_em = alloc_extent_map();
|
|
|
|
|
if (!hole_em) {
|
btrfs: reset last_reflink_trans after fsyncing inode
When an inode has a last_reflink_trans matching the current transaction,
we have to take special care when logging its checksums in order to
avoid getting checksum items with overlapping ranges in a log tree,
which could result in missing checksums after log replay (more on that
in the changelogs of commit 40e046acbd2f36 ("Btrfs: fix missing data
checksums after replaying a log tree") and commit e289f03ea79bbc ("btrfs:
fix corrupt log due to concurrent fsync of inodes with shared extents")).
We also need to make sure a full fsync will copy all old file extent
items it finds in modified leaves, because they might have been copied
from some other inode.
However once we fsync an inode, we don't need to keep paying the price of
that extra special care in future fsyncs done in the same transaction,
unless the inode is used for another reflink operation or the full sync
flag is set on it (truncate, failure to allocate extent maps for holes,
and other exceptional and infrequent cases).
So after we fsync an inode, reset its last_reflink_trans to zero. In case
another reflink happens, we continue to update the last_reflink_trans of
the inode, just as before. Also set last_reflink_trans to the generation
of the last transaction that modified the inode whenever we need to set
the full sync flag on the inode, just like when we need to load an inode
from disk after eviction.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-02-17 12:12:06 +00:00
|
|
|
btrfs_set_inode_full_sync(inode);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
goto next;
|
|
|
|
|
}
|
|
|
|
|
hole_em->start = cur_offset;
|
|
|
|
|
hole_em->len = hole_size;
|
|
|
|
|
hole_em->orig_start = cur_offset;
|
2009-11-12 09:35:36 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
hole_em->block_start = EXTENT_MAP_HOLE;
|
|
|
|
|
hole_em->block_len = 0;
|
2012-12-03 10:31:19 -05:00
|
|
|
hole_em->orig_block_len = 0;
|
2013-04-04 14:31:27 -04:00
|
|
|
hole_em->ram_bytes = hole_size;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
hole_em->compress_type = BTRFS_COMPRESS_NONE;
|
2016-06-22 18:54:23 -04:00
|
|
|
hole_em->generation = fs_info->generation;
|
2009-11-12 09:35:36 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
while (1) {
|
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-05 16:51:15 -04:00
|
|
|
err = add_extent_mapping(em_tree, hole_em, 1);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
|
if (err != -EEXIST)
|
|
|
|
|
break;
|
2020-11-02 16:49:04 +02:00
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
cur_offset +
|
|
|
|
|
hole_size - 1, 0);
|
|
|
|
|
}
|
|
|
|
|
free_extent_map(hole_em);
|
2020-01-17 09:02:22 -05:00
|
|
|
} else {
|
2020-11-02 16:49:04 +02:00
|
|
|
err = btrfs_inode_set_file_extent_range(inode,
|
2020-01-17 09:02:22 -05:00
|
|
|
cur_offset, hole_size);
|
|
|
|
|
if (err)
|
|
|
|
|
break;
|
2008-10-30 14:19:41 -04:00
|
|
|
}
|
2013-10-22 12:18:51 -04:00
|
|
|
next:
|
2008-10-30 14:19:41 -04:00
|
|
|
free_extent_map(em);
|
2010-05-16 10:48:46 -04:00
|
|
|
em = NULL;
|
2008-10-30 14:19:41 -04:00
|
|
|
cur_offset = last_byte;
|
2009-11-12 09:35:36 +00:00
|
|
|
if (cur_offset >= block_end)
|
2008-10-30 14:19:41 -04:00
|
|
|
break;
|
|
|
|
|
}
|
2010-05-16 10:48:46 -04:00
|
|
|
free_extent_map(em);
|
2017-12-12 21:43:52 +01:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
|
2008-10-30 14:19:41 -04:00
|
|
|
return err;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2013-01-12 02:57:22 +00:00
|
|
|
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
|
2009-11-12 09:35:36 +00:00
|
|
|
{
|
2011-12-14 20:12:01 -05:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_trans_handle *trans;
|
2011-01-31 15:30:16 -05:00
|
|
|
loff_t oldsize = i_size_read(inode);
|
2013-01-12 02:57:22 +00:00
|
|
|
loff_t newsize = attr->ia_size;
|
|
|
|
|
int mask = attr->ia_valid;
|
2009-11-12 09:35:36 +00:00
|
|
|
int ret;
|
|
|
|
|
|
2013-01-12 02:57:22 +00:00
|
|
|
/*
|
|
|
|
|
* The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
|
|
|
|
|
* special case where we need to update the times despite not having
|
|
|
|
|
* these flags set. For all other operations the VFS sets these flags
|
|
|
|
|
* explicitly if it wants a timestamp update.
|
|
|
|
|
*/
|
2013-11-19 07:17:07 -08:00
|
|
|
if (newsize != oldsize) {
|
|
|
|
|
inode_inc_iversion(inode);
|
2022-06-21 18:40:48 +02:00
|
|
|
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
|
|
|
|
|
inode->i_mtime = current_time(inode);
|
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
|
|
|
|
}
|
2013-11-19 07:17:07 -08:00
|
|
|
}
|
2013-01-12 02:57:22 +00:00
|
|
|
|
2011-01-31 15:30:16 -05:00
|
|
|
if (newsize > oldsize) {
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
/*
|
2017-06-22 02:19:11 +02:00
|
|
|
* Don't do an expanding truncate while snapshotting is ongoing.
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
* This is to ensure the snapshot captures a fully consistent
|
|
|
|
|
* state of this file - if the snapshot captures this expanding
|
|
|
|
|
* truncation, it must capture all writes that happened before
|
|
|
|
|
* this truncation.
|
|
|
|
|
*/
|
2020-01-30 14:59:45 +02:00
|
|
|
btrfs_drew_write_lock(&root->snapshot_lock);
|
2020-11-02 16:49:04 +02:00
|
|
|
ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
if (ret) {
|
2020-01-30 14:59:45 +02:00
|
|
|
btrfs_drew_write_unlock(&root->snapshot_lock);
|
2009-11-12 09:35:36 +00:00
|
|
|
return ret;
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
|
2011-12-14 20:12:01 -05:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
if (IS_ERR(trans)) {
|
2020-01-30 14:59:45 +02:00
|
|
|
btrfs_drew_write_unlock(&root->snapshot_lock);
|
2011-12-14 20:12:01 -05:00
|
|
|
return PTR_ERR(trans);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possible to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
}
|
2011-12-14 20:12:01 -05:00
|
|
|
|
|
|
|
|
i_size_write(inode, newsize);
|
2020-11-02 16:48:53 +02:00
|
|
|
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
|
2016-01-21 15:56:03 +05:30
|
|
|
pagecache_isize_extended(inode, oldsize, newsize);
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2020-01-30 14:59:45 +02:00
|
|
|
btrfs_drew_write_unlock(&root->snapshot_lock);
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2011-01-31 15:30:16 -05:00
|
|
|
} else {
|
2021-02-04 19:22:09 +09:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
|
|
|
|
|
if (btrfs_is_zoned(fs_info)) {
|
|
|
|
|
ret = btrfs_wait_ordered_range(inode,
|
|
|
|
|
ALIGN(newsize, fs_info->sectorsize),
|
|
|
|
|
(u64)-1);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
|
2011-01-31 15:30:16 -05:00
|
|
|
/*
|
|
|
|
|
* We're truncating a file that used to have good data down to
|
2020-10-01 09:40:39 +03:00
|
|
|
* zero. Make sure any new writes to the file get on disk
|
|
|
|
|
* on close.
|
2011-01-31 15:30:16 -05:00
|
|
|
*/
|
|
|
|
|
if (newsize == 0)
|
2020-10-01 09:40:39 +03:00
|
|
|
set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
|
2012-05-23 14:13:11 -04:00
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2009-11-12 09:35:36 +00:00
|
|
|
|
2011-01-31 15:30:16 -05:00
|
|
|
truncate_setsize(inode, newsize);
|
2013-02-08 07:01:08 +00:00
|
|
|
|
|
|
|
|
inode_dio_wait(inode);
|
|
|
|
|
|
2018-02-06 20:40:31 +00:00
|
|
|
ret = btrfs_truncate(inode, newsize == oldsize);
|
2013-08-29 16:43:28 -04:00
|
|
|
if (ret && inode->i_nlink) {
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
/*
|
2018-05-11 13:13:32 -07:00
|
|
|
* Truncate failed, so fix up the in-memory size. We
|
|
|
|
|
* adjusted disk_i_size down as we removed extents, so
|
|
|
|
|
* wait for disk_i_size to be stable and then update the
|
|
|
|
|
* in-memory size to match.
|
2013-08-29 16:43:28 -04:00
|
|
|
*/
|
2018-05-11 13:13:32 -07:00
|
|
|
err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
|
2013-08-29 16:43:28 -04:00
|
|
|
if (err)
|
2018-05-11 13:13:32 -07:00
|
|
|
return err;
|
|
|
|
|
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
|
2013-08-29 16:43:28 -04:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
}
|
|
|
|
|
|
2011-01-31 15:30:16 -05:00
|
|
|
return ret;
|
2009-11-12 09:35:36 +00:00
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
|
|
|
|
|
struct iattr *attr)
|
2008-10-30 14:19:41 -04:00
|
|
|
{
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2010-12-20 16:04:08 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-10-30 14:19:41 -04:00
|
|
|
int err;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2010-12-20 16:04:08 +08:00
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
|
return -EROFS;
|
|
|
|
|
|
2021-07-27 12:48:49 +02:00
|
|
|
err = setattr_prepare(mnt_userns, dentry, attr);
|
2008-10-30 14:19:41 -04:00
|
|
|
if (err)
|
|
|
|
|
return err;
|
2007-08-30 11:54:02 -04:00
|
|
|
|
2009-03-31 13:27:11 -04:00
|
|
|
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
|
2013-01-12 02:57:22 +00:00
|
|
|
err = btrfs_setsize(inode, attr);
|
2009-11-12 09:35:36 +00:00
|
|
|
if (err)
|
|
|
|
|
return err;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
2008-10-30 14:19:41 -04:00
|
|
|
|
2010-06-04 11:30:02 +02:00
|
|
|
if (attr->ia_valid) {
|
2021-07-27 12:48:49 +02:00
|
|
|
setattr_copy(mnt_userns, inode, attr);
|
2012-04-05 15:03:02 -04:00
|
|
|
inode_inc_iversion(inode);
|
2011-11-30 10:45:38 -05:00
|
|
|
err = btrfs_dirty_inode(inode);
|
2010-06-04 11:30:02 +02:00
|
|
|
|
2011-11-30 10:45:38 -05:00
|
|
|
if (!err && attr->ia_valid & ATTR_MODE)
|
2021-07-27 12:48:49 +02:00
|
|
|
err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
|
2010-06-04 11:30:02 +02:00
|
|
|
}
|
2008-07-24 12:16:36 -04:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
return err;
|
|
|
|
|
}
|
2008-01-14 16:24:38 -05:00
|
|
|
|
2013-11-19 22:29:35 +00:00
|
|
|
/*
|
2022-02-09 20:21:39 +00:00
|
|
|
* While truncating the inode pages during eviction, we get the VFS
|
|
|
|
|
* calling btrfs_invalidate_folio() against each folio of the inode. This
|
|
|
|
|
* is slow because the calls to btrfs_invalidate_folio() result in a
|
|
|
|
|
* huge amount of calls to lock_extent_bits() and clear_extent_bit(),
|
|
|
|
|
* which keep merging and splitting extent_state structures over and over,
|
|
|
|
|
* wasting lots of time.
|
2013-11-19 22:29:35 +00:00
|
|
|
*
|
2022-02-09 20:21:39 +00:00
|
|
|
* Therefore if the inode is being evicted, let btrfs_invalidate_folio()
|
|
|
|
|
* skip all those expensive operations on a per folio basis and do only
|
|
|
|
|
* the ordered io finishing, while we release here the extent_map and
|
|
|
|
|
* extent_state structures, without the excessive merging and splitting.
|
2013-11-19 22:29:35 +00:00
|
|
|
*/
|
|
|
|
|
static void evict_inode_truncate_pages(struct inode *inode)
|
|
|
|
|
{
|
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
|
struct rb_node *node;
|
|
|
|
|
|
|
|
|
|
ASSERT(inode->i_state & I_FREEING);
|
2014-04-03 14:47:49 -07:00
|
|
|
truncate_inode_pages_final(&inode->i_data);
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
|
write_lock(&map_tree->lock);
|
2018-08-23 03:51:52 +08:00
|
|
|
while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
|
2013-11-19 22:29:35 +00:00
|
|
|
struct extent_map *em;
|
|
|
|
|
|
2018-08-23 03:51:52 +08:00
|
|
|
node = rb_first_cached(&map_tree->map);
|
2013-11-19 22:29:35 +00:00
|
|
|
em = rb_entry(node, struct extent_map, rb_node);
|
2013-12-14 15:27:31 +08:00
|
|
|
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
|
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
|
2013-11-19 22:29:35 +00:00
|
|
|
remove_extent_mapping(map_tree, em);
|
|
|
|
|
free_extent_map(em);
|
2014-08-08 02:47:05 +01:00
|
|
|
if (need_resched()) {
|
|
|
|
|
write_unlock(&map_tree->lock);
|
|
|
|
|
cond_resched();
|
|
|
|
|
write_lock(&map_tree->lock);
|
|
|
|
|
}
|
2013-11-19 22:29:35 +00:00
|
|
|
}
|
|
|
|
|
write_unlock(&map_tree->lock);
|
|
|
|
|
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 00:55:42 +01:00
|
|
|
/*
|
|
|
|
|
* Keep looping until we have no more ranges in the io tree.
|
2020-06-01 21:47:05 -07:00
|
|
|
* We can have ongoing bios started by readahead that have
|
|
|
|
|
* their endio callback (extent_io.c:end_bio_extent_readpage)
|
2015-06-10 12:55:41 +01:00
|
|
|
* still in progress (unlocked the pages in the bio but did not yet
|
|
|
|
|
* unlocked the ranges in the io tree). Therefore this means some
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 00:55:42 +01:00
|
|
|
* ranges can still be locked and eviction started because before
|
|
|
|
|
* submitting those bios, which are executed by a separate task (work
|
|
|
|
|
* queue kthread), inode references (inode->i_count) were not taken
|
|
|
|
|
* (which would be dropped in the end io callback of each bio).
|
|
|
|
|
* Therefore here we effectively end up waiting for those bios and
|
|
|
|
|
* anyone else holding locked ranges without having bumped the inode's
|
|
|
|
|
* reference count - if we don't do it, when they access the inode's
|
|
|
|
|
* io_tree to unlock a range it may be too late, leading to an
|
|
|
|
|
* use-after-free issue.
|
|
|
|
|
*/
|
2013-11-19 22:29:35 +00:00
|
|
|
spin_lock(&io_tree->lock);
|
|
|
|
|
while (!RB_EMPTY_ROOT(&io_tree->state)) {
|
|
|
|
|
struct extent_state *state;
|
|
|
|
|
struct extent_state *cached_state = NULL;
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 00:55:42 +01:00
|
|
|
u64 start;
|
|
|
|
|
u64 end;
|
2018-10-12 13:02:48 +01:00
|
|
|
unsigned state_flags;
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
|
node = rb_first(&io_tree->state);
|
|
|
|
|
state = rb_entry(node, struct extent_state, rb_node);
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 00:55:42 +01:00
|
|
|
start = state->start;
|
|
|
|
|
end = state->end;
|
2018-10-12 13:02:48 +01:00
|
|
|
state_flags = state->state;
|
2013-11-19 22:29:35 +00:00
|
|
|
spin_unlock(&io_tree->lock);
|
|
|
|
|
|
2015-12-03 14:30:40 +01:00
|
|
|
lock_extent_bits(io_tree, start, end, &cached_state);
|
2015-09-29 10:35:16 +08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If still has DELALLOC flag, the extent didn't reach disk,
|
|
|
|
|
* and its reserved space won't be freed by delayed_ref.
|
|
|
|
|
* So we need to free its reserved space here.
|
2022-02-09 20:21:39 +00:00
|
|
|
* (Refer to comment in btrfs_invalidate_folio, case 2)
|
2015-09-29 10:35:16 +08:00
|
|
|
*
|
|
|
|
|
* Note, end is the bytenr of last byte, so we need + 1 here.
|
|
|
|
|
*/
|
2018-10-12 13:02:48 +01:00
|
|
|
if (state_flags & EXTENT_DELALLOC)
|
2020-06-03 08:55:11 +03:00
|
|
|
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
|
|
|
|
|
end - start + 1);
|
2015-09-29 10:35:16 +08:00
|
|
|
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 00:55:42 +01:00
|
|
|
clear_extent_bit(io_tree, start, end,
|
2019-08-15 14:04:04 -07:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
|
|
|
|
|
&cached_state);
|
2013-11-19 22:29:35 +00:00
|
|
|
|
2014-08-08 02:47:05 +01:00
|
|
|
cond_resched();
|
2013-11-19 22:29:35 +00:00
|
|
|
spin_lock(&io_tree->lock);
|
|
|
|
|
}
|
|
|
|
|
spin_unlock(&io_tree->lock);
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-11 13:13:36 -07:00
|
|
|
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
|
2018-09-28 07:18:19 -04:00
|
|
|
struct btrfs_block_rsv *rsv)
|
2018-05-11 13:13:36 -07:00
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2019-08-01 18:19:37 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
2019-08-22 15:14:33 -04:00
|
|
|
u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
|
2019-08-01 18:19:37 -04:00
|
|
|
int ret;
|
2018-05-11 13:13:36 -07:00
|
|
|
|
2019-08-01 18:19:37 -04:00
|
|
|
/*
|
|
|
|
|
* Eviction should be taking place at some place safe because of our
|
|
|
|
|
* delayed iputs. However the normal flushing code will run delayed
|
|
|
|
|
* iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
|
|
|
|
|
*
|
|
|
|
|
* We reserve the delayed_refs_extra here again because we can't use
|
|
|
|
|
* btrfs_start_transaction(root, 0) for the same deadlocky reason as
|
|
|
|
|
* above. We reserve our extra bit here because we generate a ton of
|
|
|
|
|
* delayed refs activity by truncating.
|
|
|
|
|
*
|
2021-11-09 10:12:04 -05:00
|
|
|
* BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
|
|
|
|
|
* if we fail to make this reservation we can re-try without the
|
|
|
|
|
* delayed_refs_extra so we can make some forward progress.
|
2019-08-01 18:19:37 -04:00
|
|
|
*/
|
2021-11-09 10:12:07 -05:00
|
|
|
ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
|
2019-08-01 18:19:37 -04:00
|
|
|
BTRFS_RESERVE_FLUSH_EVICT);
|
|
|
|
|
if (ret) {
|
2021-11-09 10:12:07 -05:00
|
|
|
ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
|
2021-11-09 10:12:04 -05:00
|
|
|
BTRFS_RESERVE_FLUSH_EVICT);
|
|
|
|
|
if (ret) {
|
2019-08-01 18:19:37 -04:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"could not allocate space for delete; will truncate on mount");
|
|
|
|
|
return ERR_PTR(-ENOSPC);
|
|
|
|
|
}
|
|
|
|
|
delayed_refs_extra = 0;
|
|
|
|
|
}
|
2018-05-11 13:13:36 -07:00
|
|
|
|
2019-08-01 18:19:37 -04:00
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
|
return trans;
|
|
|
|
|
|
|
|
|
|
if (delayed_refs_extra) {
|
|
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
|
|
|
|
trans->bytes_reserved = delayed_refs_extra;
|
|
|
|
|
btrfs_block_rsv_migrate(rsv, trans->block_rsv,
|
|
|
|
|
delayed_refs_extra, 1);
|
2018-05-11 13:13:36 -07:00
|
|
|
}
|
2019-08-01 18:19:37 -04:00
|
|
|
return trans;
|
2018-05-11 13:13:36 -07:00
|
|
|
}
|
|
|
|
|
|
2010-06-07 11:35:40 -04:00
|
|
|
void btrfs_evict_inode(struct inode *inode)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2018-05-11 13:13:36 -07:00
|
|
|
struct btrfs_block_rsv *rsv;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret;
|
|
|
|
|
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
trace_btrfs_inode_evict(inode);
|
|
|
|
|
|
2016-06-29 09:46:41 +03:00
|
|
|
if (!root) {
|
2021-06-30 13:01:49 -07:00
|
|
|
fsverity_cleanup_inode(inode);
|
2018-01-25 11:02:53 -07:00
|
|
|
clear_inode(inode);
|
2016-06-29 09:46:41 +03:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2013-11-19 22:29:35 +00:00
|
|
|
evict_inode_truncate_pages(inode);
|
|
|
|
|
|
2013-09-05 16:58:43 +02:00
|
|
|
if (inode->i_nlink &&
|
|
|
|
|
((btrfs_root_refs(&root->root_item) != 0 &&
|
|
|
|
|
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
|
2017-02-20 13:50:35 +02:00
|
|
|
btrfs_is_free_space_inode(BTRFS_I(inode))))
|
2010-06-07 11:35:40 -04:00
|
|
|
goto no_delete;
|
|
|
|
|
|
2018-05-11 13:13:37 -07:00
|
|
|
if (is_bad_inode(inode))
|
2007-06-12 06:35:45 -04:00
|
|
|
goto no_delete;
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2017-02-20 13:50:57 +02:00
|
|
|
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
|
Btrfs: cleanup the read failure record after write or when the inode is freeing
After the data is written successfully, we should cleanup the read failure record
in that range because
- If we set data COW for the file, the range that the failure record pointed to is
mapped to a new place, so it is invalid.
- If we set no data COW for the file, and if there is no error during writting,
the corrupted data is corrected, so the failure record can be removed. And if
some errors happen on the mirrors, we also needn't worry about it because the
failure record will be recreated if we read the same place again.
Sometimes, we may fail to correct the data, so the failure records will be left
in the tree, we need free them when we free the inode or the memory leak happens.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-12 18:44:04 +08:00
|
|
|
|
2018-05-11 13:13:33 -07:00
|
|
|
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
|
2009-11-12 09:34:40 +00:00
|
|
|
goto no_delete;
|
|
|
|
|
|
2009-09-21 16:00:26 -04:00
|
|
|
if (inode->i_nlink > 0) {
|
2013-09-05 16:58:43 +02:00
|
|
|
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
|
|
|
|
|
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
|
2009-09-21 16:00:26 -04:00
|
|
|
goto no_delete;
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-03 17:18:07 -05:00
|
|
|
/*
|
|
|
|
|
* This makes sure the inode item in tree is uptodate and the space for
|
|
|
|
|
* the inode update is released.
|
|
|
|
|
*/
|
2017-01-10 20:35:40 +02:00
|
|
|
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
|
2018-05-11 13:13:37 -07:00
|
|
|
if (ret)
|
2012-12-19 06:59:51 +00:00
|
|
|
goto no_delete;
|
|
|
|
|
|
2021-12-03 17:18:07 -05:00
|
|
|
/*
|
|
|
|
|
* This drops any pending insert or delete operations we have for this
|
|
|
|
|
* inode. We could have a delayed dir index deletion queued up, but
|
|
|
|
|
* we're removing the inode completely so that'll be taken care of in
|
|
|
|
|
* the truncate.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
|
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
|
2018-05-11 13:13:37 -07:00
|
|
|
if (!rsv)
|
2011-08-05 13:22:24 -04:00
|
|
|
goto no_delete;
|
2019-08-22 15:14:33 -04:00
|
|
|
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
|
2022-06-23 17:08:14 +02:00
|
|
|
rsv->failfast = true;
|
2011-08-05 13:22:24 -04:00
|
|
|
|
2017-02-20 13:50:34 +02:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), 0);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
while (1) {
|
2021-12-03 17:18:09 -05:00
|
|
|
struct btrfs_truncate_control control = {
|
2021-12-03 17:18:15 -05:00
|
|
|
.inode = BTRFS_I(inode),
|
2021-12-03 17:18:14 -05:00
|
|
|
.ino = btrfs_ino(BTRFS_I(inode)),
|
2021-12-03 17:18:09 -05:00
|
|
|
.new_size = 0,
|
|
|
|
|
.min_type = 0,
|
|
|
|
|
};
|
|
|
|
|
|
2018-09-28 07:18:19 -04:00
|
|
|
trans = evict_refill_and_join(root, rsv);
|
2018-05-11 13:13:37 -07:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
|
goto free_rsv;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2011-08-05 13:22:24 -04:00
|
|
|
trans->block_rsv = rsv;
|
|
|
|
|
|
2021-12-03 17:18:15 -05:00
|
|
|
ret = btrfs_truncate_inode_items(trans, root, &control);
|
2018-05-11 13:13:37 -07:00
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
|
|
|
|
if (ret && ret != -ENOSPC && ret != -EAGAIN)
|
|
|
|
|
goto free_rsv;
|
|
|
|
|
else if (!ret)
|
2009-11-12 09:35:36 +00:00
|
|
|
break;
|
|
|
|
|
}
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2013-08-13 14:10:08 -04:00
|
|
|
/*
|
2018-05-11 13:13:37 -07:00
|
|
|
* Errors here aren't a big deal, it just means we leave orphan items in
|
|
|
|
|
* the tree. They will be cleaned up on the next mount. If the inode
|
|
|
|
|
* number gets reused, cleanup deletes the orphan item without doing
|
|
|
|
|
* anything, and unlink reuses the existing orphan item.
|
|
|
|
|
*
|
|
|
|
|
* If it turns out that we are dropping too many of these, we might want
|
|
|
|
|
* to add a mechanism for retrying these after a commit.
|
2013-08-13 14:10:08 -04:00
|
|
|
*/
|
2018-09-28 07:18:19 -04:00
|
|
|
trans = evict_refill_and_join(root, rsv);
|
2018-05-11 13:13:37 -07:00
|
|
|
if (!IS_ERR(trans)) {
|
|
|
|
|
trans->block_rsv = rsv;
|
|
|
|
|
btrfs_orphan_del(trans, BTRFS_I(inode));
|
|
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
|
}
|
2007-06-22 14:16:25 -04:00
|
|
|
|
2018-05-11 13:13:37 -07:00
|
|
|
free_rsv:
|
|
|
|
|
btrfs_free_block_rsv(fs_info, rsv);
|
2007-06-12 06:35:45 -04:00
|
|
|
no_delete:
|
2018-05-11 13:13:37 -07:00
|
|
|
/*
|
|
|
|
|
* If we didn't successfully delete, the orphan item will still be in
|
|
|
|
|
* the tree and we'll retry on the next mount. Again, we might also want
|
|
|
|
|
* to retry these periodically in the future.
|
|
|
|
|
*/
|
2017-01-10 20:35:39 +02:00
|
|
|
btrfs_remove_delayed_node(BTRFS_I(inode));
|
2021-06-30 13:01:49 -07:00
|
|
|
fsverity_cleanup_inode(inode);
|
2012-05-03 14:48:02 +02:00
|
|
|
clear_inode(inode);
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2019-03-13 13:55:11 +08:00
|
|
|
* Return the key found in the dir entry in the location pointer, fill @type
|
|
|
|
|
* with BTRFS_FT_*, and return 0.
|
|
|
|
|
*
|
2018-03-05 17:13:37 +08:00
|
|
|
* If no dir entries were found, returns -ENOENT.
|
|
|
|
|
* If found a corrupted location in dir entry, returns -EUCLEAN.
|
2007-06-12 06:35:45 -04:00
|
|
|
*/
|
|
|
|
|
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
|
2019-03-13 13:55:11 +08:00
|
|
|
struct btrfs_key *location, u8 *type)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
|
|
|
|
const char *name = dentry->d_name.name;
|
|
|
|
|
int namelen = dentry->d_name.len;
|
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-10-25 15:48:28 -04:00
|
|
|
int ret = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 10:38:47 -07:00
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
2007-12-12 14:38:19 -05:00
|
|
|
|
2017-01-20 14:54:07 +01:00
|
|
|
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
|
|
|
|
|
name, namelen, 0);
|
2018-09-12 06:06:26 +08:00
|
|
|
if (IS_ERR_OR_NULL(di)) {
|
|
|
|
|
ret = di ? PTR_ERR(di) : -ENOENT;
|
2018-03-05 17:13:37 +08:00
|
|
|
goto out;
|
|
|
|
|
}
|
2009-01-05 21:25:51 -05:00
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
|
2017-10-30 11:14:38 -06:00
|
|
|
if (location->type != BTRFS_INODE_ITEM_KEY &&
|
|
|
|
|
location->type != BTRFS_ROOT_ITEM_KEY) {
|
2018-03-05 17:13:37 +08:00
|
|
|
ret = -EUCLEAN;
|
2017-10-30 11:14:38 -06:00
|
|
|
btrfs_warn(root->fs_info,
|
|
|
|
|
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
|
|
|
|
|
__func__, name, btrfs_ino(BTRFS_I(dir)),
|
|
|
|
|
location->objectid, location->type, location->offset);
|
|
|
|
|
}
|
2019-03-13 13:55:11 +08:00
|
|
|
if (!ret)
|
|
|
|
|
*type = btrfs_dir_type(path->nodes[0], di);
|
2007-06-12 06:35:45 -04:00
|
|
|
out:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* when we hit a tree root in a directory, the btrfs part of the inode
|
|
|
|
|
* needs to be changed to reflect the root directory of the tree root. This
|
|
|
|
|
* is kind of like crossing a mount point.
|
|
|
|
|
*/
|
2016-06-22 18:54:24 -04:00
|
|
|
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
|
2009-09-21 15:56:00 -04:00
|
|
|
struct inode *dir,
|
|
|
|
|
struct dentry *dentry,
|
|
|
|
|
struct btrfs_key *location,
|
|
|
|
|
struct btrfs_root **sub_root)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2009-09-21 15:56:00 -04:00
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct btrfs_root *new_root;
|
|
|
|
|
struct btrfs_root_ref *ref;
|
|
|
|
|
struct extent_buffer *leaf;
|
2015-01-02 19:36:14 +01:00
|
|
|
struct btrfs_key key;
|
2009-09-21 15:56:00 -04:00
|
|
|
int ret;
|
|
|
|
|
int err = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path) {
|
|
|
|
|
err = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
err = -ENOENT;
|
2015-01-02 19:36:14 +01:00
|
|
|
key.objectid = BTRFS_I(dir)->root->root_key.objectid;
|
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
|
key.offset = location->objectid;
|
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
|
2009-09-21 15:56:00 -04:00
|
|
|
if (ret) {
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
err = ret;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
|
2017-01-10 20:35:31 +02:00
|
|
|
if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
|
2009-09-21 15:56:00 -04:00
|
|
|
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
|
|
|
|
|
goto out;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
|
|
|
|
|
(unsigned long)(ref + 1),
|
|
|
|
|
dentry->d_name.len);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
|
2011-04-21 01:20:15 +02:00
|
|
|
btrfs_release_path(path);
|
2009-09-21 15:56:00 -04:00
|
|
|
|
2020-05-15 19:35:55 +02:00
|
|
|
new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
|
2009-09-21 15:56:00 -04:00
|
|
|
if (IS_ERR(new_root)) {
|
|
|
|
|
err = PTR_ERR(new_root);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*sub_root = new_root;
|
|
|
|
|
location->objectid = btrfs_root_dirid(&new_root->root_item);
|
|
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
|
location->offset = 0;
|
|
|
|
|
err = 0;
|
|
|
|
|
out:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return err;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
static void inode_tree_add(struct inode *inode)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_inode *entry;
|
2009-08-21 10:09:44 +02:00
|
|
|
struct rb_node **p;
|
|
|
|
|
struct rb_node *parent;
|
2013-09-02 12:19:13 +01:00
|
|
|
struct rb_node *new = &BTRFS_I(inode)->rb_node;
|
2017-01-10 20:35:31 +02:00
|
|
|
u64 ino = btrfs_ino(BTRFS_I(inode));
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
|
2010-10-23 15:19:20 -04:00
|
|
|
if (inode_unhashed(inode))
|
2009-09-21 16:00:26 -04:00
|
|
|
return;
|
2013-05-15 07:48:16 +00:00
|
|
|
parent = NULL;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
spin_lock(&root->inode_lock);
|
2013-05-15 07:48:16 +00:00
|
|
|
p = &root->inode_tree.rb_node;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
while (*p) {
|
|
|
|
|
parent = *p;
|
|
|
|
|
entry = rb_entry(parent, struct btrfs_inode, rb_node);
|
|
|
|
|
|
2018-06-29 10:56:40 +02:00
|
|
|
if (ino < btrfs_ino(entry))
|
2009-08-21 10:09:44 +02:00
|
|
|
p = &parent->rb_left;
|
2018-06-29 10:56:40 +02:00
|
|
|
else if (ino > btrfs_ino(entry))
|
2009-08-21 10:09:44 +02:00
|
|
|
p = &parent->rb_right;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
else {
|
|
|
|
|
WARN_ON(!(entry->vfs_inode.i_state &
|
2010-06-02 17:38:30 -04:00
|
|
|
(I_WILL_FREE | I_FREEING)));
|
2013-09-02 12:19:13 +01:00
|
|
|
rb_replace_node(parent, new, &root->inode_tree);
|
2009-08-21 10:09:44 +02:00
|
|
|
RB_CLEAR_NODE(parent);
|
|
|
|
|
spin_unlock(&root->inode_lock);
|
2013-09-02 12:19:13 +01:00
|
|
|
return;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
}
|
|
|
|
|
}
|
2013-09-02 12:19:13 +01:00
|
|
|
rb_link_node(new, parent, p);
|
|
|
|
|
rb_insert_color(new, &root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-31 14:42:38 +03:00
|
|
|
static void inode_tree_del(struct btrfs_inode *inode)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
{
|
2020-08-31 14:42:38 +03:00
|
|
|
struct btrfs_root *root = inode->root;
|
2009-09-21 16:00:26 -04:00
|
|
|
int empty = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
|
2009-08-21 10:09:44 +02:00
|
|
|
spin_lock(&root->inode_lock);
|
2020-08-31 14:42:38 +03:00
|
|
|
if (!RB_EMPTY_NODE(&inode->rb_node)) {
|
|
|
|
|
rb_erase(&inode->rb_node, &root->inode_tree);
|
|
|
|
|
RB_CLEAR_NODE(&inode->rb_node);
|
2009-09-21 16:00:26 -04:00
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
}
|
2009-08-21 10:09:44 +02:00
|
|
|
spin_unlock(&root->inode_lock);
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2013-09-05 16:58:43 +02:00
|
|
|
if (empty && btrfs_root_refs(&root->root_item) == 0) {
|
2009-09-21 16:00:26 -04:00
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
|
if (empty)
|
|
|
|
|
btrfs_add_dead_root(root);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
|
2008-09-05 16:13:11 -04:00
|
|
|
static int btrfs_init_locked_inode(struct inode *inode, void *p)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_iget_args *args = p;
|
2020-05-15 19:35:59 +02:00
|
|
|
|
|
|
|
|
inode->i_ino = args->ino;
|
|
|
|
|
BTRFS_I(inode)->location.objectid = args->ino;
|
|
|
|
|
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
|
BTRFS_I(inode)->location.offset = 0;
|
2020-02-14 16:11:43 -05:00
|
|
|
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
|
|
|
|
|
BUG_ON(args->root && !BTRFS_I(inode)->root);
|
2007-06-12 06:35:45 -04:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int btrfs_find_actor(struct inode *inode, void *opaque)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_iget_args *args = opaque;
|
2020-05-15 19:35:59 +02:00
|
|
|
|
|
|
|
|
return args->ino == BTRFS_I(inode)->location.objectid &&
|
2009-01-05 21:25:51 -05:00
|
|
|
args->root == BTRFS_I(inode)->root;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2020-05-15 19:35:59 +02:00
|
|
|
/*
 * Thin wrapper around iget5_locked() for a (root, inode number) pair.
 *
 * @s:    super block to look the inode up on
 * @ino:  inode number
 * @root: root the inode belongs to
 *
 * The hash value comes from btrfs_inode_hash(@ino, @root); candidates are
 * matched with btrfs_find_actor() and a new inode is set up with
 * btrfs_init_locked_inode().  Returns whatever iget5_locked() returns.
 */
static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
				       struct btrfs_root *root)
{
	struct btrfs_iget_args args = {
		.ino = ino,
		.root = root,
	};

	return iget5_locked(s, btrfs_inode_hash(ino, root), btrfs_find_actor,
			    btrfs_init_locked_inode, &args);
}
|
|
|
|
|
|
2019-10-03 19:09:35 +02:00
|
|
|
/*
|
2020-05-15 19:35:59 +02:00
|
|
|
* Get an inode object given its inode number and corresponding root.
|
2019-10-03 19:09:35 +02:00
|
|
|
* Path can be preallocated to prevent recursing back to iget through
|
|
|
|
|
* allocator. NULL is also valid but may require an additional allocation
|
|
|
|
|
* later.
|
2008-07-21 02:01:04 +05:30
|
|
|
*/
|
2020-05-15 19:35:59 +02:00
|
|
|
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
|
2019-10-03 19:09:35 +02:00
|
|
|
struct btrfs_root *root, struct btrfs_path *path)
|
2008-07-21 02:01:04 +05:30
|
|
|
{
|
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
2020-05-15 19:35:59 +02:00
|
|
|
inode = btrfs_iget_locked(s, ino, root);
|
2008-07-21 02:01:04 +05:30
|
|
|
if (!inode)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2008-07-21 02:01:04 +05:30
|
|
|
|
|
|
|
|
if (inode->i_state & I_NEW) {
|
2016-06-06 11:51:25 +01:00
|
|
|
int ret;
|
|
|
|
|
|
Btrfs: fix deadlock on tree root leaf when finding free extent
When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:
schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]
At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.
So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.
Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-24 10:13:03 +01:00
|
|
|
ret = btrfs_read_locked_inode(inode, path);
|
2018-07-29 23:04:50 +01:00
|
|
|
if (!ret) {
|
2011-07-12 11:25:31 -07:00
|
|
|
inode_tree_add(inode);
|
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
|
} else {
|
2018-07-29 23:04:51 +01:00
|
|
|
iget_failed(inode);
|
|
|
|
|
/*
|
|
|
|
|
* ret > 0 can come from btrfs_search_slot called by
|
|
|
|
|
* btrfs_read_locked_inode, this means the inode item
|
|
|
|
|
* was not found.
|
|
|
|
|
*/
|
|
|
|
|
if (ret > 0)
|
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
inode = ERR_PTR(ret);
|
2011-07-12 11:25:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2008-07-21 02:01:04 +05:30
|
|
|
return inode;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-15 19:35:59 +02:00
|
|
|
/*
 * Convenience wrapper around btrfs_iget_path() for callers that have no
 * preallocated path: the path argument is passed as NULL.
 *
 * Returns whatever btrfs_iget_path() returns (an inode or an ERR_PTR).
 */
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
	struct btrfs_path *no_path = NULL;

	return btrfs_iget_path(s, ino, root, no_path);
}
|
|
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
static struct inode *new_simple_dir(struct super_block *s,
|
|
|
|
|
struct btrfs_key *key,
|
|
|
|
|
struct btrfs_root *root)
|
|
|
|
|
{
|
|
|
|
|
struct inode *inode = new_inode(s);
|
|
|
|
|
|
|
|
|
|
if (!inode)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
2020-02-14 16:11:43 -05:00
|
|
|
BTRFS_I(inode)->root = btrfs_grab_root(root);
|
2009-09-21 15:56:00 -04:00
|
|
|
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
|
2012-05-23 14:13:11 -04:00
|
|
|
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
|
2009-09-21 15:56:00 -04:00
|
|
|
|
|
|
|
|
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
|
2019-12-05 10:36:04 -08:00
|
|
|
/*
|
|
|
|
|
* We only need lookup, the rest is read-only and there's no inode
|
|
|
|
|
* associated with the dentry
|
|
|
|
|
*/
|
|
|
|
|
inode->i_op = &simple_dir_inode_operations;
|
2017-01-25 17:06:39 -08:00
|
|
|
inode->i_opflags &= ~IOP_XATTR;
|
2009-09-21 15:56:00 -04:00
|
|
|
inode->i_fop = &simple_dir_operations;
|
|
|
|
|
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
|
2016-09-14 07:48:06 -07:00
|
|
|
inode->i_mtime = current_time(inode);
|
2012-07-04 12:48:07 +05:30
|
|
|
inode->i_atime = inode->i_mtime;
|
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
2018-06-21 18:04:06 +02:00
|
|
|
BTRFS_I(inode)->i_otime = inode->i_mtime;
|
2009-09-21 15:56:00 -04:00
|
|
|
|
|
|
|
|
return inode;
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-01 15:42:07 +01:00
|
|
|
/*
 * The btrfs file type constants must have the same values as the generic
 * FT_* constants: btrfs_inode_type() returns the result of
 * fs_umode_to_ftype() and that value is compared directly against the type
 * reported for a directory entry.
 */
static_assert(FT_UNKNOWN == BTRFS_FT_UNKNOWN);
static_assert(FT_REG_FILE == BTRFS_FT_REG_FILE);
static_assert(FT_DIR == BTRFS_FT_DIR);
static_assert(FT_CHRDEV == BTRFS_FT_CHRDEV);
static_assert(FT_BLKDEV == BTRFS_FT_BLKDEV);
static_assert(FT_FIFO == BTRFS_FT_FIFO);
static_assert(FT_SOCK == BTRFS_FT_SOCK);
static_assert(FT_SYMLINK == BTRFS_FT_SYMLINK);
|
|
|
|
|
|
2019-03-13 13:55:11 +08:00
|
|
|
static inline u8 btrfs_inode_type(struct inode *inode)
|
|
|
|
|
{
|
|
|
|
|
return fs_umode_to_ftype(inode->i_mode);
|
|
|
|
|
}
|
|
|
|
|
|
2008-11-17 21:02:50 -05:00
|
|
|
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2009-01-05 21:25:51 -05:00
|
|
|
struct inode *inode;
|
2009-09-21 15:56:00 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *sub_root = root;
|
|
|
|
|
struct btrfs_key location;
|
2019-03-13 13:55:11 +08:00
|
|
|
u8 di_type = 0;
|
2011-06-28 16:18:59 -04:00
|
|
|
int ret = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
if (dentry->d_name.len > BTRFS_NAME_LEN)
|
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2019-03-13 13:55:11 +08:00
|
|
|
ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (ret < 0)
|
|
|
|
|
return ERR_PTR(ret);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
if (location.type == BTRFS_INODE_ITEM_KEY) {
|
2020-05-15 19:35:59 +02:00
|
|
|
inode = btrfs_iget(dir->i_sb, location.objectid, root);
|
2019-03-13 13:55:11 +08:00
|
|
|
if (IS_ERR(inode))
|
|
|
|
|
return inode;
|
|
|
|
|
|
|
|
|
|
/* Do extra check against inode mode with di_type */
|
|
|
|
|
if (btrfs_inode_type(inode) != di_type) {
|
|
|
|
|
btrfs_crit(fs_info,
|
|
|
|
|
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
|
|
|
|
|
inode->i_mode, btrfs_inode_type(inode),
|
|
|
|
|
di_type);
|
|
|
|
|
iput(inode);
|
|
|
|
|
return ERR_PTR(-EUCLEAN);
|
|
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
return inode;
|
|
|
|
|
}
|
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
ret = fixup_tree_root_location(fs_info, dir, dentry,
|
2009-09-21 15:56:00 -04:00
|
|
|
&location, &sub_root);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
if (ret != -ENOENT)
|
|
|
|
|
inode = ERR_PTR(ret);
|
|
|
|
|
else
|
2022-07-14 13:48:10 +03:00
|
|
|
inode = new_simple_dir(dir->i_sb, &location, root);
|
2009-09-21 15:56:00 -04:00
|
|
|
} else {
|
2020-05-15 19:35:59 +02:00
|
|
|
inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
|
2020-01-24 09:33:01 -05:00
|
|
|
btrfs_put_root(sub_root);
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2022-07-14 13:48:10 +03:00
|
|
|
if (IS_ERR(inode))
|
|
|
|
|
return inode;
|
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
down_read(&fs_info->cleanup_work_sem);
|
2017-07-17 08:45:34 +01:00
|
|
|
if (!sb_rdonly(inode->i_sb))
|
2011-01-31 16:22:42 -05:00
|
|
|
ret = btrfs_orphan_cleanup(sub_root);
|
2016-06-22 18:54:23 -04:00
|
|
|
up_read(&fs_info->cleanup_work_sem);
|
2013-06-03 21:39:49 -04:00
|
|
|
if (ret) {
|
|
|
|
|
iput(inode);
|
2011-01-31 16:22:42 -05:00
|
|
|
inode = ERR_PTR(ret);
|
2013-06-03 21:39:49 -04:00
|
|
|
}
|
2009-11-12 09:34:40 +00:00
|
|
|
}
|
|
|
|
|
|
2008-11-17 21:02:50 -05:00
|
|
|
return inode;
|
|
|
|
|
}
|
|
|
|
|
|
2011-01-07 17:49:23 +11:00
|
|
|
static int btrfs_dentry_delete(const struct dentry *dentry)
|
2009-09-21 16:00:26 -04:00
|
|
|
{
|
|
|
|
|
struct btrfs_root *root;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2012-02-21 17:04:28 +08:00
|
|
|
if (!inode && !IS_ROOT(dentry))
|
2015-03-17 22:25:59 +00:00
|
|
|
inode = d_inode(dentry->d_parent);
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2012-02-21 17:04:28 +08:00
|
|
|
if (inode) {
|
|
|
|
|
root = BTRFS_I(inode)->root;
|
2009-10-09 09:25:16 -04:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
|
|
|
|
return 1;
|
2012-02-21 17:04:28 +08:00
|
|
|
|
2017-01-10 20:35:31 +02:00
|
|
|
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2012-02-21 17:04:28 +08:00
|
|
|
return 1;
|
2009-10-09 09:25:16 -04:00
|
|
|
}
|
2009-09-21 16:00:26 -04:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2008-11-17 21:02:50 -05:00
|
|
|
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
|
2012-06-10 17:13:09 -04:00
|
|
|
unsigned int flags)
|
2008-11-17 21:02:50 -05:00
|
|
|
{
|
2018-10-10 16:38:27 -04:00
|
|
|
struct inode *inode = btrfs_lookup_dentry(dir, dentry);
|
2013-12-13 09:51:42 +09:00
|
|
|
|
2018-10-10 16:38:27 -04:00
|
|
|
if (inode == ERR_PTR(-ENOENT))
|
|
|
|
|
inode = NULL;
|
2014-10-12 22:24:21 -04:00
|
|
|
return d_splice_alias(inode, dentry);
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2017-07-24 15:14:25 -04:00
|
|
|
/*
|
|
|
|
|
* All this infrastructure exists because dir_emit can fault, and we are holding
|
|
|
|
|
* the tree lock when doing readdir. For now just allocate a buffer and copy
|
|
|
|
|
* our information into that, and then dir_emit from the buffer. This is
|
|
|
|
|
* similar to what NFS does, only we don't keep the buffer around in pagecache
|
|
|
|
|
* because I'm afraid I'll mess that up. Long term we need to make filldir do
|
|
|
|
|
* copy_to_user_inatomic so we don't have to worry about page faulting under the
|
|
|
|
|
* tree lock.
|
|
|
|
|
*/
|
|
|
|
|
static int btrfs_opendir(struct inode *inode, struct file *file)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_file_private *private;
|
|
|
|
|
|
|
|
|
|
private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
|
|
|
|
|
if (!private)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
|
|
|
|
|
if (!private->filldir_buf) {
|
|
|
|
|
kfree(private);
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
}
|
|
|
|
|
file->private_data = private;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct dir_entry {
|
|
|
|
|
u64 ino;
|
|
|
|
|
u64 offset;
|
|
|
|
|
unsigned type;
|
|
|
|
|
int name_len;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
|
|
|
|
|
{
|
|
|
|
|
while (entries--) {
|
|
|
|
|
struct dir_entry *entry = addr;
|
|
|
|
|
char *name = (char *)(entry + 1);
|
|
|
|
|
|
2018-04-16 21:10:14 +02:00
|
|
|
ctx->pos = get_unaligned(&entry->offset);
|
|
|
|
|
if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
|
|
|
|
|
get_unaligned(&entry->ino),
|
|
|
|
|
get_unaligned(&entry->type)))
|
2017-07-24 15:14:25 -04:00
|
|
|
return 1;
|
2018-04-16 21:10:14 +02:00
|
|
|
addr += sizeof(struct dir_entry) +
|
|
|
|
|
get_unaligned(&entry->name_len);
|
2017-07-24 15:14:25 -04:00
|
|
|
ctx->pos++;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2013-05-22 16:48:09 -04:00
|
|
|
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2013-05-22 16:48:09 -04:00
|
|
|
struct inode *inode = file_inode(file);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2017-07-24 15:14:25 -04:00
|
|
|
struct btrfs_file_private *private = file->private_data;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
|
struct btrfs_key key;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct btrfs_key found_key;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_path *path;
|
2017-07-24 15:14:25 -04:00
|
|
|
void *addr;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
struct list_head ins_list;
|
|
|
|
|
struct list_head del_list;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret;
|
2007-10-15 16:14:19 -04:00
|
|
|
char *name_ptr;
|
|
|
|
|
int name_len;
|
2017-07-24 15:14:25 -04:00
|
|
|
int entries = 0;
|
|
|
|
|
int total_len = 0;
|
2016-05-20 13:50:33 -07:00
|
|
|
bool put = false;
|
2016-11-21 15:59:04 +01:00
|
|
|
struct btrfs_key location;
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2013-05-22 16:48:09 -04:00
|
|
|
if (!dir_emit_dots(file, ctx))
|
|
|
|
|
return 0;
|
|
|
|
|
|
2008-08-17 17:08:36 +01:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
2011-05-28 07:00:39 -04:00
|
|
|
|
2017-07-24 15:14:25 -04:00
|
|
|
addr = private->filldir_buf;
|
2015-11-27 16:31:35 +01:00
|
|
|
path->reada = READA_FORWARD;
|
2008-08-17 17:08:36 +01:00
|
|
|
|
2016-11-21 15:59:04 +01:00
|
|
|
INIT_LIST_HEAD(&ins_list);
|
|
|
|
|
INIT_LIST_HEAD(&del_list);
|
|
|
|
|
put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
|
2017-07-24 15:14:25 -04:00
|
|
|
again:
|
2016-11-21 15:59:04 +01:00
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
2013-05-22 16:48:09 -04:00
|
|
|
key.offset = ctx->pos;
|
2017-01-10 20:35:31 +02:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2022-03-09 14:50:42 +01:00
|
|
|
btrfs_for_each_slot(root, &key, &found_key, path, ret) {
|
2017-07-24 15:14:25 -04:00
|
|
|
struct dir_entry *entry;
|
2022-03-09 14:50:42 +01:00
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
2007-10-15 16:14:19 -04:00
|
|
|
|
|
|
|
|
if (found_key.objectid != key.objectid)
|
2007-06-12 06:35:45 -04:00
|
|
|
break;
|
2016-11-21 15:59:04 +01:00
|
|
|
if (found_key.type != BTRFS_DIR_INDEX_KEY)
|
2007-06-12 06:35:45 -04:00
|
|
|
break;
|
2013-05-22 16:48:09 -04:00
|
|
|
if (found_key.offset < ctx->pos)
|
2022-03-09 14:50:42 +01:00
|
|
|
continue;
|
2016-11-21 15:59:04 +01:00
|
|
|
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
|
2022-03-09 14:50:42 +01:00
|
|
|
continue;
|
|
|
|
|
di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
|
2016-11-21 15:59:04 +01:00
|
|
|
name_len = btrfs_dir_name_len(leaf, di);
|
2017-07-24 15:14:25 -04:00
|
|
|
if ((total_len + sizeof(struct dir_entry) + name_len) >=
|
|
|
|
|
PAGE_SIZE) {
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto nopos;
|
|
|
|
|
addr = private->filldir_buf;
|
|
|
|
|
entries = 0;
|
|
|
|
|
total_len = 0;
|
|
|
|
|
goto again;
|
2016-11-21 15:59:04 +01:00
|
|
|
}
|
2017-07-24 15:14:25 -04:00
|
|
|
|
|
|
|
|
entry = addr;
|
2018-04-16 21:10:14 +02:00
|
|
|
put_unaligned(name_len, &entry->name_len);
|
2017-07-24 15:14:25 -04:00
|
|
|
name_ptr = (char *)(entry + 1);
|
2016-11-21 15:59:04 +01:00
|
|
|
read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
|
|
|
|
|
name_len);
|
2019-03-26 21:39:34 +00:00
|
|
|
put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
|
2018-04-16 21:10:14 +02:00
|
|
|
&entry->type);
|
2016-11-21 15:59:04 +01:00
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &location);
|
2018-04-16 21:10:14 +02:00
|
|
|
put_unaligned(location.objectid, &entry->ino);
|
|
|
|
|
put_unaligned(found_key.offset, &entry->offset);
|
2017-07-24 15:14:25 -04:00
|
|
|
entries++;
|
|
|
|
|
addr += sizeof(struct dir_entry) + name_len;
|
|
|
|
|
total_len += sizeof(struct dir_entry) + name_len;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
2022-03-09 14:50:42 +01:00
|
|
|
/* Catch error encountered during iteration */
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
goto err;
|
|
|
|
|
|
2017-07-24 15:14:25 -04:00
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
|
|
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto nopos;
|
2008-08-17 17:08:36 +01:00
|
|
|
|
2016-11-05 13:26:35 -04:00
|
|
|
ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
|
2016-11-21 15:59:04 +01:00
|
|
|
if (ret)
|
btrfs: properly set the termination value of ctx->pos in readdir
The value of ctx->pos in the last readdir call is supposed to be set to
INT_MAX due to 32bit compatibility, unless 'pos' is intentionally set to a
larger value, then it's LLONG_MAX.
There's a report from PaX SIZE_OVERFLOW plugin that "ctx->pos++"
overflows (https://forums.grsecurity.net/viewtopic.php?f=1&t=4284), on a
64bit arch, where the value is 0x7fffffffffffffff ie. LLONG_MAX before
the increment.
We can get to that situation like that:
* emit all regular readdir entries
* still in the same call to readdir, bump the last pos to INT_MAX
* next call to readdir will not emit any entries, but will reach the
bump code again, finds pos to be INT_MAX and sets it to LLONG_MAX
Normally this is not a problem, but if we call readdir again, we'll find
'pos' set to LLONG_MAX and the unconditional increment will overflow.
The report from Victor at
(http://thread.gmane.org/gmane.comp.file-systems.btrfs/49500) with debugging
print shows that pattern:
Overflow: e
Overflow: 7fffffff
Overflow: 7fffffffffffffff
PAX: size overflow detected in function btrfs_real_readdir
fs/btrfs/inode.c:5760 cicus.935_282 max, count: 9, decl: pos; num: 0;
context: dir_context;
CPU: 0 PID: 2630 Comm: polkitd Not tainted 4.2.3-grsec #1
Hardware name: Gigabyte Technology Co., Ltd. H81ND2H/H81ND2H, BIOS F3 08/11/2015
ffffffff81901608 0000000000000000 ffffffff819015e6 ffffc90004973d48
ffffffff81742f0f 0000000000000007 ffffffff81901608 ffffc90004973d78
ffffffff811cb706 0000000000000000 ffff8800d47359e0 ffffc90004973ed8
Call Trace:
[<ffffffff81742f0f>] dump_stack+0x4c/0x7f
[<ffffffff811cb706>] report_size_overflow+0x36/0x40
[<ffffffff812ef0bc>] btrfs_real_readdir+0x69c/0x6d0
[<ffffffff811dafc8>] iterate_dir+0xa8/0x150
[<ffffffff811e6d8d>] ? __fget_light+0x2d/0x70
[<ffffffff811dba3a>] SyS_getdents+0xba/0x1c0
Overflow: 1a
[<ffffffff811db070>] ? iterate_dir+0x150/0x150
[<ffffffff81749b69>] entry_SYSCALL_64_fastpath+0x12/0x83
The jump from 7fffffff to 7fffffffffffffff happens when new dir entries
are not yet synced and are processed from the delayed list. Then the code
could go to the bump section again even though it might not emit any new
dir entries from the delayed list.
The fix avoids entering the "bump" section again once we've finished
emitting the entries, both for synced and delayed entries.
References: https://forums.grsecurity.net/viewtopic.php?f=1&t=4284
Reported-by: Victor <services@swwu.com>
CC: stable@vger.kernel.org
Signed-off-by: David Sterba <dsterba@suse.com>
Tested-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-13 13:44:28 +01:00
|
|
|
goto nopos;
|
|
|
|
|
|
2013-07-11 16:19:42 -07:00
|
|
|
/*
|
|
|
|
|
* Stop new entries from being returned after we return the last
|
|
|
|
|
* entry.
|
|
|
|
|
*
|
|
|
|
|
* New directory entries are assigned a strictly increasing
|
|
|
|
|
* offset. This means that new entries created during readdir
|
|
|
|
|
* are *guaranteed* to be seen in the future by that readdir.
|
|
|
|
|
* This has broken buggy programs which operate on names as
|
|
|
|
|
* they're returned by readdir. Until we re-use freed offsets
|
|
|
|
|
* we have this hack to stop new entries from being returned
|
|
|
|
|
* under the assumption that they'll never reach this huge
|
|
|
|
|
* offset.
|
|
|
|
|
*
|
|
|
|
|
* This is being careful not to overflow 32bit loff_t unless the
|
|
|
|
|
* last entry requires it because doing so has broken 32bit apps
|
|
|
|
|
* in the past.
|
|
|
|
|
*/
|
2016-11-21 15:59:04 +01:00
|
|
|
if (ctx->pos >= INT_MAX)
|
|
|
|
|
ctx->pos = LLONG_MAX;
|
|
|
|
|
else
|
|
|
|
|
ctx->pos = INT_MAX;
|
2007-06-12 06:35:45 -04:00
|
|
|
nopos:
|
|
|
|
|
ret = 0;
|
|
|
|
|
err:
|
2016-05-20 13:50:33 -07:00
|
|
|
if (put)
|
|
|
|
|
btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
|
2007-06-12 06:35:45 -04:00
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2007-06-22 14:16:25 -04:00
|
|
|
* This is somewhat expensive, updating the tree every time the
|
2007-06-12 06:35:45 -04:00
|
|
|
* inode changes. But, it is most likely to find the inode in cache.
|
|
|
|
|
* FIXME, needs more benchmarking...there are no reasons other than performance
|
|
|
|
|
* to keep or drop this code.
|
|
|
|
|
*/
|
2013-04-25 20:41:01 +00:00
|
|
|
static int btrfs_dirty_inode(struct inode *inode)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2016-06-22 18:54:24 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_trans_handle *trans;
|
2010-05-16 10:49:58 -04:00
|
|
|
int ret;
|
|
|
|
|
|
2012-05-23 14:13:11 -04:00
|
|
|
if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
|
2011-11-30 10:45:38 -05:00
|
|
|
return 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2011-04-13 12:54:33 -04:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-11-30 10:45:38 -05:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
|
return PTR_ERR(trans);
|
2010-05-16 10:49:58 -04:00
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2021-02-22 18:40:44 +02:00
|
|
|
if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
|
2010-05-26 11:02:00 -04:00
|
|
|
/* whoops, lets try again with the full transaction */
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2010-05-26 11:02:00 -04:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2011-11-30 10:45:38 -05:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
|
return PTR_ERR(trans);
|
2010-05-16 10:49:58 -04:00
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2010-05-26 11:02:00 -04:00
|
|
|
}
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (BTRFS_I(inode)->delayed_node)
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_balance_delayed_items(fs_info);
|
2011-11-30 10:45:38 -05:00
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* This is a copy of file_update_time. We need this so we can return error on
|
|
|
|
|
* ENOSPC for updating the inode in the case of file write and mmap writes.
|
|
|
|
|
*/
|
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following Coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
|
|
|
static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
|
2012-03-26 09:46:47 -04:00
|
|
|
int flags)
|
2011-11-30 10:45:38 -05:00
|
|
|
{
|
2012-06-15 09:49:33 +02:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2017-12-11 06:35:24 -05:00
|
|
|
bool dirty = flags & ~S_VERSION;
|
2012-06-15 09:49:33 +02:00
|
|
|
|
|
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
|
return -EROFS;
|
|
|
|
|
|
2012-03-26 09:46:47 -04:00
|
|
|
if (flags & S_VERSION)
|
2017-12-11 06:35:24 -05:00
|
|
|
dirty |= inode_maybe_inc_iversion(inode, dirty);
|
2012-03-26 09:46:47 -04:00
|
|
|
if (flags & S_CTIME)
|
|
|
|
|
inode->i_ctime = *now;
|
|
|
|
|
if (flags & S_MTIME)
|
|
|
|
|
inode->i_mtime = *now;
|
|
|
|
|
if (flags & S_ATIME)
|
|
|
|
|
inode->i_atime = *now;
|
2017-12-11 06:35:24 -05:00
|
|
|
return dirty ? btrfs_dirty_inode(inode) : 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* find the highest existing sequence number in a directory
|
|
|
|
|
* and then set the in-memory index_cnt variable to reflect
|
|
|
|
|
* free sequence numbers
|
|
|
|
|
*/
|
2017-02-20 13:50:32 +02:00
|
|
|
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
|
2008-07-24 12:12:38 -04:00
|
|
|
{
|
2017-02-20 13:50:32 +02:00
|
|
|
struct btrfs_root *root = inode->root;
|
2008-07-24 12:12:38 -04:00
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
int ret;
|
|
|
|
|
|
2017-02-20 13:50:32 +02:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2014-06-04 18:41:45 +02:00
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
2008-07-24 12:12:38 -04:00
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
goto out;
|
|
|
|
|
/* FIXME: we should be able to handle this */
|
|
|
|
|
if (ret == 0)
|
|
|
|
|
goto out;
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
|
|
if (path->slots[0] == 0) {
|
2021-12-15 12:19:59 +00:00
|
|
|
inode->index_cnt = BTRFS_DIR_START_INDEX;
|
2008-07-24 12:12:38 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
2017-02-20 13:50:32 +02:00
|
|
|
if (found_key.objectid != btrfs_ino(inode) ||
|
2014-06-04 18:41:45 +02:00
|
|
|
found_key.type != BTRFS_DIR_INDEX_KEY) {
|
2021-12-15 12:19:59 +00:00
|
|
|
inode->index_cnt = BTRFS_DIR_START_INDEX;
|
2008-07-24 12:12:38 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-20 13:50:32 +02:00
|
|
|
inode->index_cnt = found_key.offset + 1;
|
2008-07-24 12:12:38 -04:00
|
|
|
out:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* helper to find a free sequence number in a given directory. This current
|
|
|
|
|
* code is very simple, later versions will do smarter things in the btree
|
|
|
|
|
*/
|
2017-02-20 13:50:33 +02:00
|
|
|
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
|
2008-07-24 12:12:38 -04:00
|
|
|
{
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
2017-02-20 13:50:33 +02:00
|
|
|
if (dir->index_cnt == (u64)-1) {
|
|
|
|
|
ret = btrfs_inode_delayed_dir_index_count(dir);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (ret) {
|
|
|
|
|
ret = btrfs_set_inode_index_count(dir);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2008-07-24 12:12:38 -04:00
|
|
|
}
|
|
|
|
|
|
2017-02-20 13:50:33 +02:00
|
|
|
*index = dir->index_cnt;
|
|
|
|
|
dir->index_cnt++;
|
2008-07-24 12:12:38 -04:00
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2014-09-08 13:08:51 -07:00
|
|
|
static int btrfs_insert_inode_locked(struct inode *inode)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_iget_args args;
|
2020-05-15 19:35:59 +02:00
|
|
|
|
|
|
|
|
args.ino = BTRFS_I(inode)->location.objectid;
|
2014-09-08 13:08:51 -07:00
|
|
|
args.root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
|
|
return insert_inode_locked4(inode,
|
|
|
|
|
btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
|
|
|
|
|
btrfs_find_actor, &args);
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-14 18:12:34 -07:00
|
|
|
int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
|
|
|
|
|
unsigned int *trans_num_items)
|
|
|
|
|
{
|
|
|
|
|
struct inode *dir = args->dir;
|
|
|
|
|
struct inode *inode = args->inode;
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
/* 1 to add inode item */
|
|
|
|
|
*trans_num_items = 1;
|
|
|
|
|
/* 1 to add compression property */
|
|
|
|
|
if (BTRFS_I(dir)->prop_compress)
|
|
|
|
|
(*trans_num_items)++;
|
|
|
|
|
/* 1 to add default ACL xattr */
|
|
|
|
|
if (args->default_acl)
|
|
|
|
|
(*trans_num_items)++;
|
|
|
|
|
/* 1 to add access ACL xattr */
|
|
|
|
|
if (args->acl)
|
|
|
|
|
(*trans_num_items)++;
|
|
|
|
|
#ifdef CONFIG_SECURITY
|
|
|
|
|
/* 1 to add LSM xattr */
|
|
|
|
|
if (dir->i_security)
|
|
|
|
|
(*trans_num_items)++;
|
|
|
|
|
#endif
|
|
|
|
|
if (args->orphan) {
|
|
|
|
|
/* 1 to add orphan item */
|
|
|
|
|
(*trans_num_items)++;
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* 1 to add dir item
|
|
|
|
|
* 1 to add dir index
|
|
|
|
|
* 1 to update parent inode item
|
2022-05-09 16:29:14 +01:00
|
|
|
*
|
|
|
|
|
* No need for 1 unit for the inode ref item because it is
|
|
|
|
|
* inserted in a batch together with the inode item at
|
|
|
|
|
* btrfs_create_new_inode().
|
2022-03-14 18:12:34 -07:00
|
|
|
*/
|
2022-05-09 16:29:14 +01:00
|
|
|
*trans_num_items += 3;
|
2022-03-14 18:12:34 -07:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
|
|
|
|
|
{
|
|
|
|
|
posix_acl_release(args->acl);
|
|
|
|
|
posix_acl_release(args->default_acl);
|
|
|
|
|
}
|
|
|
|
|
|
2017-07-18 17:37:05 +08:00
|
|
|
/*
|
|
|
|
|
* Inherit flags from the parent inode.
|
|
|
|
|
*
|
|
|
|
|
* Currently only the compression flags and the cow flags are inherited.
|
|
|
|
|
*/
|
|
|
|
|
static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
|
|
|
|
|
{
|
|
|
|
|
unsigned int flags;
|
|
|
|
|
|
|
|
|
|
flags = BTRFS_I(dir)->flags;
|
|
|
|
|
|
|
|
|
|
if (flags & BTRFS_INODE_NOCOMPRESS) {
|
|
|
|
|
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
|
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
|
|
|
|
|
} else if (flags & BTRFS_INODE_COMPRESS) {
|
|
|
|
|
BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
|
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (flags & BTRFS_INODE_NODATACOW) {
|
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
|
|
|
|
|
if (S_ISREG(inode->i_mode))
|
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-26 18:40:21 +02:00
|
|
|
btrfs_sync_inode_flags_to_i_flags(inode);
|
2017-07-18 17:37:05 +08:00
|
|
|
}
|
|
|
|
|
|
2022-03-14 18:12:34 -07:00
|
|
|
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
struct btrfs_new_inode_args *args)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
struct inode *dir = args->dir;
|
2022-03-14 18:12:34 -07:00
|
|
|
struct inode *inode = args->inode;
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
const char *name = args->orphan ? NULL : args->dentry->d_name.name;
|
|
|
|
|
int name_len = args->orphan ? 0 : args->dentry->d_name.len;
|
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2022-03-14 18:12:34 -07:00
|
|
|
struct btrfs_root *root;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct btrfs_inode_item *inode_item;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_key *location;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct btrfs_path *path;
|
2022-03-09 17:31:41 -08:00
|
|
|
u64 objectid;
|
2008-01-29 15:15:18 -05:00
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
|
struct btrfs_key key[2];
|
|
|
|
|
u32 sizes[2];
|
btrfs: loop only once over data sizes array when inserting an item batch
When inserting a batch of items into a btree, we end up looping over the
data sizes array 3 times:
1) Once in the caller of btrfs_insert_empty_items(), when it populates the
array with the data sizes for each item;
2) Once at btrfs_insert_empty_items() to sum the elements of the data
sizes array and compute the total data size;
3) And then once again at setup_items_for_insert(), where we do exactly
the same as what we do at btrfs_insert_empty_items(), to compute the
total data size.
That is not bad for small arrays, but when the arrays have hundreds of
elements, the time spent on looping is not negligible. For example when
doing batch inserts of delayed items for dir index items or when logging
a directory, it's common to have 200 to 260 dir index items in a single
batch when using a leaf size of 16K and using file names between 8 and 12
characters. For a 64K leaf size, multiply that by 4. Taking into account
that during directory logging or when flushing delayed dir index items we
can have many of those large batches, the time spent on the looping adds
up quickly.
It's also more important to avoid it at setup_items_for_insert(), since
we are holding a write lock on a leaf and, in some cases, on upper nodes
of the btree, which causes us to block other tasks that want to access
the leaf and nodes for longer than necessary.
So change the code so that setup_items_for_insert() and
btrfs_insert_empty_items() no longer compute the total data size, and
instead rely on the caller to supply it. This makes us loop over the
array only once, where we can both populate the data size array and
compute the total data size, taking advantage of spatial and temporal
locality. To make this more manageable, use a structure to contain
all the relevant details for a batch of items (keys array, data sizes
array, total data size, number of items), and use it as an argument
for btrfs_insert_empty_items() and setup_items_for_insert().
This patch is part of a small patchset that is comprised of the following
patches:
btrfs: loop only once over data sizes array when inserting an item batch
btrfs: unexport setup_items_for_insert()
btrfs: use single bulk copy operations when logging directories
This is patch 1/3 and performance results, and the specific tests, are
included in the changelog of patch 3/3.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-24 12:28:13 +01:00
|
|
|
struct btrfs_item_batch batch;
|
2008-01-29 15:15:18 -05:00
|
|
|
unsigned long ptr;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret;
|
|
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 10:38:47 -07:00
|
|
|
if (!path)
|
2022-03-14 18:12:32 -07:00
|
|
|
return -ENOMEM;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2022-03-14 18:12:34 -07:00
|
|
|
if (!args->subvol)
|
|
|
|
|
BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
|
|
|
|
|
root = BTRFS_I(inode)->root;
|
|
|
|
|
|
2022-03-09 17:31:41 -08:00
|
|
|
ret = btrfs_get_free_objectid(root, &objectid);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
inode->i_ino = objectid;
|
|
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (args->orphan) {
|
|
|
|
|
/*
|
|
|
|
|
* O_TMPFILE, set link count to 0, so that after this point, we
|
|
|
|
|
* fill in an inode item with the correct link count.
|
|
|
|
|
*/
|
|
|
|
|
set_nlink(inode, 0);
|
|
|
|
|
} else {
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
trace_btrfs_inode_request(dir);
|
|
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
2008-07-24 12:12:38 -04:00
|
|
|
}
|
2022-04-13 16:20:21 +01:00
|
|
|
/* index_cnt is ignored for everything but a dir. */
|
|
|
|
|
BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
|
2008-09-05 16:13:11 -04:00
|
|
|
BTRFS_I(inode)->generation = trans->transid;
|
2010-11-19 02:18:02 +00:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
2007-08-27 16:49:44 -04:00
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
/*
|
|
|
|
|
* Subvolumes don't inherit flags from their parent directory.
|
|
|
|
|
* Originally this was probably by accident, but we probably can't
|
|
|
|
|
* change it now without compatibility issues.
|
|
|
|
|
*/
|
|
|
|
|
if (!args->subvol)
|
|
|
|
|
btrfs_inherit_iflags(inode, dir);
|
2022-03-09 17:31:42 -08:00
|
|
|
|
2022-03-14 18:12:32 -07:00
|
|
|
if (S_ISREG(inode->i_mode)) {
|
2022-03-09 17:31:42 -08:00
|
|
|
if (btrfs_test_opt(fs_info, NODATASUM))
|
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
|
|
|
|
|
if (btrfs_test_opt(fs_info, NODATACOW))
|
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
|
|
|
|
|
BTRFS_INODE_NODATASUM;
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
location = &BTRFS_I(inode)->location;
|
|
|
|
|
location->objectid = objectid;
|
|
|
|
|
location->offset = 0;
|
|
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
|
|
|
|
|
|
ret = btrfs_insert_inode_locked(inode);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
if (!args->orphan)
|
|
|
|
|
BTRFS_I(dir)->index_cnt--;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
/*
|
|
|
|
|
* We could have gotten an inode number from somebody who was fsynced
|
|
|
|
|
* and then removed in this same transaction, so let's just set full
|
|
|
|
|
* sync since it will be a full sync anyway and this will blow away the
|
|
|
|
|
* old info in the log.
|
|
|
|
|
*/
|
btrfs: reset last_reflink_trans after fsyncing inode
When an inode has a last_reflink_trans matching the current transaction,
we have to take special care when logging its checksums in order to
avoid getting checksum items with overlapping ranges in a log tree,
which could result in missing checksums after log replay (more on that
in the changelogs of commit 40e046acbd2f36 ("Btrfs: fix missing data
checksums after replaying a log tree") and commit e289f03ea79bbc ("btrfs:
fix corrupt log due to concurrent fsync of inodes with shared extents")).
We also need to make sure a full fsync will copy all old file extent
items it finds in modified leaves, because they might have been copied
from some other inode.
However once we fsync an inode, we don't need to keep paying the price of
that extra special care in future fsyncs done in the same transaction,
unless the inode is used for another reflink operation or the full sync
flag is set on it (truncate, failure to allocate extent maps for holes,
and other exceptional and infrequent cases).
So after we fsync an inode, reset its last_reflink_trans to zero. In case
another reflink happens, we continue to update the last_reflink_trans of
the inode, just as before. Also set last_reflink_trans to the generation
of the last transaction that modified the inode whenever we need to set
the full sync flag on the inode, just like when we need to load an inode
from disk after eviction.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-02-17 12:12:06 +00:00
|
|
|
btrfs_set_inode_full_sync(BTRFS_I(inode));
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
|
2008-01-29 15:15:18 -05:00
|
|
|
key[0].objectid = objectid;
|
2014-06-04 18:41:45 +02:00
|
|
|
key[0].type = BTRFS_INODE_ITEM_KEY;
|
2008-01-29 15:15:18 -05:00
|
|
|
key[0].offset = 0;
|
|
|
|
|
|
|
|
|
|
sizes[0] = sizeof(struct btrfs_inode_item);
|
2014-04-27 20:40:45 +01:00
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (!args->orphan) {
|
2014-04-27 20:40:45 +01:00
|
|
|
/*
|
|
|
|
|
* Start new inodes with an inode_ref. This is slightly more
|
|
|
|
|
* efficient for small numbers of hard links since they will
|
|
|
|
|
* be packed into one item. Extended refs will kick in if we
|
|
|
|
|
* add more hard links than can fit in the ref item.
|
|
|
|
|
*/
|
|
|
|
|
key[1].objectid = objectid;
|
2014-06-04 18:41:45 +02:00
|
|
|
key[1].type = BTRFS_INODE_REF_KEY;
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (args->subvol) {
|
2022-03-09 17:31:40 -08:00
|
|
|
key[1].offset = objectid;
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
sizes[1] = 2 + sizeof(*ref);
|
|
|
|
|
} else {
|
|
|
|
|
key[1].offset = btrfs_ino(BTRFS_I(dir));
|
|
|
|
|
sizes[1] = name_len + sizeof(*ref);
|
|
|
|
|
}
|
2014-04-27 20:40:45 +01:00
|
|
|
}
|
2008-01-29 15:15:18 -05:00
|
|
|
|
btrfs: loop only once over data sizes array when inserting an item batch
When inserting a batch of items into a btree, we end up looping over the
data sizes array 3 times:
1) Once in the caller of btrfs_insert_empty_items(), when it populates the
array with the data sizes for each item;
2) Once at btrfs_insert_empty_items() to sum the elements of the data
sizes array and compute the total data size;
3) And then once again at setup_items_for_insert(), where we do exactly
the same as what we do at btrfs_insert_empty_items(), to compute the
total data size.
That is not bad for small arrays, but when the arrays have hundreds of
elements, the time spent on looping is not negligible. For example when
doing batch inserts of delayed items for dir index items or when logging
a directory, it's common to have 200 to 260 dir index items in a single
batch when using a leaf size of 16K and using file names between 8 and 12
characters. For a 64K leaf size, multiply that by 4. Taking into account
that during directory logging or when flushing delayed dir index items we
can have many of those large batches, the time spent on the looping adds
up quickly.
It's also more important to avoid it at setup_items_for_insert(), since
we are holding a write lock on a leaf and, in some cases, on upper nodes
of the btree, which causes us to block other tasks that want to access
the leaf and nodes for longer than necessary.
So change the code so that setup_items_for_insert() and
btrfs_insert_empty_items() no longer compute the total data size, and
instead rely on the caller to supply it. This makes us loop over the
array only once, where we can both populate the data size array and
compute the total data size, taking advantage of spatial and temporal
locality. To make this more manageable, use a structure to contain
all the relevant details for a batch of items (keys array, data sizes
array, total data size, number of items), and use it as an argument
for btrfs_insert_empty_items() and setup_items_for_insert().
This patch is part of a small patchset that is comprised of the following
patches:
btrfs: loop only once over data sizes array when inserting an item batch
btrfs: unexport setup_items_for_insert()
btrfs: use single bulk copy operations when logging directories
This is patch 1/3 and performance results, and the specific tests, are
included in the changelog of patch 3/3.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-24 12:28:13 +01:00
|
|
|
batch.keys = &key[0];
|
|
|
|
|
batch.data_sizes = &sizes[0];
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
|
|
|
|
|
batch.nr = args->orphan ? 1 : 2;
|
btrfs: loop only once over data sizes array when inserting an item batch
When inserting a batch of items into a btree, we end up looping over the
data sizes array 3 times:
1) Once in the caller of btrfs_insert_empty_items(), when it populates the
array with the data sizes for each item;
2) Once at btrfs_insert_empty_items() to sum the elements of the data
sizes array and compute the total data size;
3) And then once again at setup_items_for_insert(), where we do exactly
the same as what we do at btrfs_insert_empty_items(), to compute the
total data size.
That is not bad for small arrays, but when the arrays have hundreds of
elements, the time spent on looping is not negligible. For example when
doing batch inserts of delayed items for dir index items or when logging
a directory, it's common to have 200 to 260 dir index items in a single
batch when using a leaf size of 16K and using file names between 8 and 12
characters. For a 64K leaf size, multiply that by 4. Taking into account
that during directory logging or when flushing delayed dir index items we
can have many of those large batches, the time spent on the looping adds
up quickly.
It's also more important to avoid it at setup_items_for_insert(), since
we are holding a write lock on a leaf and, in some cases, on upper nodes
of the btree, which causes us to block other tasks that want to access
the leaf and nodes for longer than necessary.
So change the code so that setup_items_for_insert() and
btrfs_insert_empty_items() no longer compute the total data size, and
instead rely on the caller to supply it. This makes us loop over the
array only once, where we can both populate the data size array and
compute the total data size, taking advantage of spatial and temporal
locality. To make this more manageable, use a structure to contain
all the relevant details for a batch of items (keys array, data sizes
array, total data size, number of items), and use it as an argument
for btrfs_insert_empty_items() and setup_items_for_insert().
This patch is part of a small patchset that is comprised of the following
patches:
btrfs: loop only once over data sizes array when inserting an item batch
btrfs: unexport setup_items_for_insert()
btrfs: use single bulk copy operations when logging directories
This is patch 1/3 and performance results, and the specific tests, are
included in the changelog of patch 3/3.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-24 12:28:13 +01:00
|
|
|
ret = btrfs_insert_empty_items(trans, root, path, &batch);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (ret != 0) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto discard;
|
|
|
|
|
}
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2016-09-14 07:48:06 -07:00
|
|
|
inode->i_mtime = current_time(inode);
|
2012-07-04 12:48:07 +05:30
|
|
|
inode->i_atime = inode->i_mtime;
|
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
2018-06-21 18:04:06 +02:00
|
|
|
BTRFS_I(inode)->i_otime = inode->i_mtime;
|
2012-07-04 12:48:07 +05:30
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
/*
|
|
|
|
|
* We're going to fill the inode item now, so at this point the inode
|
|
|
|
|
* must be fully initialized.
|
|
|
|
|
*/
|
|
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
|
struct btrfs_inode_item);
|
2016-11-08 18:09:03 +01:00
|
|
|
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
|
2012-07-10 00:58:58 -06:00
|
|
|
sizeof(*inode_item));
|
2008-09-05 16:13:11 -04:00
|
|
|
fill_inode_item(trans, path->nodes[0], inode_item, inode);
|
2008-01-29 15:15:18 -05:00
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (!args->orphan) {
|
2014-04-27 20:40:45 +01:00
|
|
|
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
|
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
|
ptr = (unsigned long)(ref + 1);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (args->subvol) {
|
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
|
|
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
|
|
|
|
|
write_extent_buffer(path->nodes[0], "..", ptr, 2);
|
|
|
|
|
} else {
|
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
|
|
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref,
|
|
|
|
|
BTRFS_I(inode)->dir_index);
|
|
|
|
|
write_extent_buffer(path->nodes[0], name, ptr, name_len);
|
|
|
|
|
}
|
2014-04-27 20:40:45 +01:00
|
|
|
}
|
2008-01-29 15:15:18 -05:00
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
2022-05-31 16:06:33 +01:00
|
|
|
/*
|
|
|
|
|
* We don't need the path anymore, plus inheriting properties, adding
|
|
|
|
|
* ACLs, security xattrs, orphan item or adding the link, will result in
|
|
|
|
|
* allocating yet another path. So just free our path.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
path = NULL;
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2022-04-08 13:15:07 -04:00
|
|
|
if (args->subvol) {
|
|
|
|
|
struct inode *parent;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Subvolumes inherit properties from their parent subvolume,
|
|
|
|
|
* not the directory they were created in.
|
|
|
|
|
*/
|
|
|
|
|
parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
|
|
|
|
|
BTRFS_I(dir)->root);
|
|
|
|
|
if (IS_ERR(parent)) {
|
|
|
|
|
ret = PTR_ERR(parent);
|
|
|
|
|
} else {
|
|
|
|
|
ret = btrfs_inode_inherit_props(trans, inode, parent);
|
|
|
|
|
iput(parent);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
ret = btrfs_inode_inherit_props(trans, inode, dir);
|
|
|
|
|
}
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
|
"error inheriting props for ino %llu (root %llu): %d",
|
|
|
|
|
btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
|
|
|
|
|
ret);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Subvolumes don't inherit ACLs or get passed to the LSM. This is
|
|
|
|
|
* probably a bug.
|
|
|
|
|
*/
|
|
|
|
|
if (!args->subvol) {
|
|
|
|
|
ret = btrfs_init_inode_security(trans, args);
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto discard;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
|
|
|
inode_tree_add(inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
A chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, so these are helpful for tracing space usage.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
|
|
|
|
trace_btrfs_inode_new(inode);
|
2020-06-05 10:41:13 +03:00
|
|
|
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
A chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, so these are helpful for tracing space usage.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2012-07-25 17:35:53 +02:00
|
|
|
btrfs_update_root_times(trans, root);
|
|
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (args->orphan) {
|
|
|
|
|
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
|
|
|
|
|
} else {
|
|
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
|
|
|
|
|
name_len, 0, BTRFS_I(inode)->dir_index);
|
|
|
|
|
}
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
|
goto discard;
|
|
|
|
|
}
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
|
2022-05-31 16:06:33 +01:00
|
|
|
return 0;
|
2014-09-08 13:08:51 -07:00
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
discard:
|
2022-03-14 18:12:32 -07:00
|
|
|
/*
|
|
|
|
|
* discard_new_inode() calls iput(), but the caller owns the reference
|
|
|
|
|
* to the inode.
|
|
|
|
|
*/
|
|
|
|
|
ihold(inode);
|
2018-05-16 12:20:05 -04:00
|
|
|
discard_new_inode(inode);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
out:
|
2007-10-15 16:14:19 -04:00
|
|
|
btrfs_free_path(path);
|
2022-03-14 18:12:32 -07:00
|
|
|
return ret;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* utility function to add 'inode' into 'parent_inode' with
|
|
|
|
|
* a give name and a given sequence number.
|
|
|
|
|
* if 'add_backref' is true, also insert a backref from the
|
|
|
|
|
* inode to the parent directory.
|
|
|
|
|
*/
|
2008-09-05 16:13:11 -04:00
|
|
|
int btrfs_add_link(struct btrfs_trans_handle *trans,
|
2017-02-20 13:51:08 +02:00
|
|
|
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
|
2008-09-05 16:13:11 -04:00
|
|
|
const char *name, int name_len, int add_backref, u64 index)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2009-09-21 15:56:00 -04:00
|
|
|
int ret = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_key key;
|
2017-02-20 13:51:08 +02:00
|
|
|
struct btrfs_root *root = parent_inode->root;
|
|
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
|
u64 parent_ino = btrfs_ino(parent_inode);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2017-02-20 13:51:08 +02:00
|
|
|
memcpy(&key, &inode->root->root_key, sizeof(key));
|
2009-09-21 15:56:00 -04:00
|
|
|
} else {
|
2011-04-20 10:31:50 +08:00
|
|
|
key.objectid = ino;
|
2014-06-04 18:41:45 +02:00
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
2009-09-21 15:56:00 -04:00
|
|
|
key.offset = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2018-08-01 11:32:29 +08:00
|
|
|
ret = btrfs_add_root_ref(trans, key.objectid,
|
2016-06-22 18:54:23 -04:00
|
|
|
root->root_key.objectid, parent_ino,
|
|
|
|
|
index, name, name_len);
|
2009-09-21 15:56:00 -04:00
|
|
|
} else if (add_backref) {
|
2011-04-20 10:31:50 +08:00
|
|
|
ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
|
parent_ino, index);
|
2009-09-21 15:56:00 -04:00
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2012-03-12 16:03:00 +01:00
|
|
|
/* Nothing to clean up yet */
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
2009-09-21 15:56:00 -04:00
|
|
|
|
2018-08-04 21:10:57 +08:00
|
|
|
ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
|
2017-02-20 13:51:08 +02:00
|
|
|
btrfs_inode_type(&inode->vfs_inode), index);
|
2012-12-17 14:26:57 -05:00
|
|
|
if (ret == -EEXIST || ret == -EOVERFLOW)
|
2012-03-12 16:03:00 +01:00
|
|
|
goto fail_dir_item;
|
|
|
|
|
else if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
return ret;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
2012-03-12 16:03:00 +01:00
|
|
|
|
2017-02-20 13:51:08 +02:00
|
|
|
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
|
2012-03-12 16:03:00 +01:00
|
|
|
name_len * 2);
|
2017-02-20 13:51:08 +02:00
|
|
|
inode_inc_iversion(&parent_inode->vfs_inode);
|
2019-05-15 16:02:47 +01:00
|
|
|
/*
|
|
|
|
|
* If we are replaying a log tree, we do not want to update the mtime
|
|
|
|
|
* and ctime of the parent directory with the current time, since the
|
|
|
|
|
* log replay procedure is responsible for setting them to their correct
|
|
|
|
|
* values (the ones it had when the fsync was done).
|
|
|
|
|
*/
|
|
|
|
|
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
|
|
|
|
|
struct timespec64 now = current_time(&parent_inode->vfs_inode);
|
|
|
|
|
|
|
|
|
|
parent_inode->vfs_inode.i_mtime = now;
|
|
|
|
|
parent_inode->vfs_inode.i_ctime = now;
|
|
|
|
|
}
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, parent_inode);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret)
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2007-06-12 06:35:45 -04:00
|
|
|
return ret;
|
2012-02-20 08:40:56 -05:00
|
|
|
|
|
|
|
|
fail_dir_item:
|
|
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
|
|
|
|
u64 local_index;
|
|
|
|
|
int err;
|
2018-08-01 11:32:28 +08:00
|
|
|
err = btrfs_del_root_ref(trans, key.objectid,
|
2016-06-22 18:54:23 -04:00
|
|
|
root->root_key.objectid, parent_ino,
|
|
|
|
|
&local_index, name, name_len);
|
2018-12-12 15:14:17 +01:00
|
|
|
if (err)
|
|
|
|
|
btrfs_abort_transaction(trans, err);
|
2012-02-20 08:40:56 -05:00
|
|
|
} else if (add_backref) {
|
|
|
|
|
u64 local_index;
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
err = btrfs_del_inode_ref(trans, root, name, name_len,
|
|
|
|
|
ino, parent_ino, &local_index);
|
2018-12-12 15:14:17 +01:00
|
|
|
if (err)
|
|
|
|
|
btrfs_abort_transaction(trans, err);
|
2012-02-20 08:40:56 -05:00
|
|
|
}
|
2018-12-12 15:14:17 +01:00
|
|
|
|
|
|
|
|
/* Return the original error code */
|
2012-02-20 08:40:56 -05:00
|
|
|
return ret;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2022-03-14 18:12:33 -07:00
|
|
|
static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
|
|
|
|
|
struct inode *inode)
|
2007-07-11 10:18:17 -04:00
|
|
|
{
|
2016-06-22 18:54:24 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2007-07-11 10:18:17 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2022-03-14 18:12:34 -07:00
|
|
|
struct btrfs_new_inode_args new_inode_args = {
|
|
|
|
|
.dir = dir,
|
|
|
|
|
.dentry = dentry,
|
|
|
|
|
.inode = inode,
|
|
|
|
|
};
|
|
|
|
|
unsigned int trans_num_items;
|
2022-03-14 18:12:33 -07:00
|
|
|
struct btrfs_trans_handle *trans;
|
2007-07-11 10:18:17 -04:00
|
|
|
int err;
|
|
|
|
|
|
2022-03-14 18:12:34 -07:00
|
|
|
err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (err)
|
|
|
|
|
goto out_inode;
|
2022-03-14 18:12:34 -07:00
|
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, trans_num_items);
|
2022-03-14 18:12:32 -07:00
|
|
|
if (IS_ERR(trans)) {
|
2022-03-14 18:12:34 -07:00
|
|
|
err = PTR_ERR(trans);
|
|
|
|
|
goto out_new_inode_args;
|
2022-03-14 18:12:32 -07:00
|
|
|
}
|
2007-12-21 16:27:21 -05:00
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
err = btrfs_create_new_inode(trans, &new_inode_args);
|
|
|
|
|
if (!err)
|
|
|
|
|
d_instantiate_new(dentry, inode);
|
2014-09-08 13:08:51 -07:00
|
|
|
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2022-03-14 18:12:33 -07:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2022-03-14 18:12:34 -07:00
|
|
|
out_new_inode_args:
|
|
|
|
|
btrfs_new_inode_args_destroy(&new_inode_args);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
out_inode:
|
|
|
|
|
if (err)
|
|
|
|
|
iput(inode);
|
2007-07-11 10:18:17 -04:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-14 18:12:33 -07:00
|
|
|
/*
 * Create a special inode named by @dentry inside directory @dir.
 *
 * A new VFS inode is allocated on @dir's superblock, its ownership and mode
 * are derived from @dir and @mode via inode_init_owner(), and it is set up
 * as a special inode for @rdev.  The remaining work is delegated to
 * btrfs_create_common().
 *
 * Returns -ENOMEM if the inode cannot be allocated, otherwise the result of
 * btrfs_create_common().
 */
static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, umode_t mode, dev_t rdev)
{
	struct inode *new = new_inode(dir->i_sb);

	if (!new)
		return -ENOMEM;

	inode_init_owner(mnt_userns, new, dir, mode);
	new->i_op = &btrfs_special_inode_operations;
	/* Pass the mode computed by inode_init_owner(), not the raw @mode. */
	init_special_inode(new, new->i_mode, rdev);

	return btrfs_create_common(dir, dentry, new);
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
/*
 * Create a new inode named by @dentry inside directory @dir, using the
 * btrfs file operation tables.
 *
 * A new VFS inode is allocated on @dir's superblock, its ownership and mode
 * are derived from @dir and @mode via inode_init_owner(), and the file,
 * inode and address-space operation tables are installed before the inode
 * is handed to btrfs_create_common().
 *
 * @excl is accepted for the caller's benefit but is not consulted here.
 *
 * Returns -ENOMEM if the inode cannot be allocated, otherwise the result of
 * btrfs_create_common().
 */
static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
			struct dentry *dentry, umode_t mode, bool excl)
{
	struct inode *new = new_inode(dir->i_sb);

	if (!new)
		return -ENOMEM;

	inode_init_owner(mnt_userns, new, dir, mode);

	new->i_op = &btrfs_file_inode_operations;
	new->i_fop = &btrfs_file_operations;
	new->i_mapping->a_ops = &btrfs_aops;

	return btrfs_create_common(dir, dentry, new);
}
|
|
|
|
|
|
|
|
|
|
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
|
|
|
|
|
struct dentry *dentry)
|
|
|
|
|
{
|
2016-01-05 16:24:05 +00:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(old_dentry);
|
2016-06-22 18:54:24 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-08-05 11:18:09 -04:00
|
|
|
u64 index;
|
2007-06-12 06:35:45 -04:00
|
|
|
int err;
|
|
|
|
|
int drop_inode = 0;
|
|
|
|
|
|
2009-11-12 07:14:26 +00:00
|
|
|
/* do not allow sys_link's with other subvols of the same device */
|
2018-08-06 14:25:24 +09:00
|
|
|
if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
|
2011-03-22 17:20:26 +00:00
|
|
|
return -EXDEV;
|
2009-11-12 07:14:26 +00:00
|
|
|
|
2012-08-08 11:32:27 -07:00
|
|
|
if (inode->i_nlink >= BTRFS_LINK_MAX)
|
2011-03-04 17:15:18 +00:00
|
|
|
return -EMLINK;
|
2009-11-12 07:14:26 +00:00
|
|
|
|
2017-02-20 13:50:33 +02:00
|
|
|
err = btrfs_set_inode_index(BTRFS_I(dir), &index);
|
2008-07-24 12:12:38 -04:00
|
|
|
if (err)
|
|
|
|
|
goto fail;
|
|
|
|
|
|
2010-05-16 10:48:46 -04:00
|
|
|
/*
|
2011-02-18 09:21:17 +00:00
|
|
|
* 2 items for inode and inode ref
|
2010-05-16 10:48:46 -04:00
|
|
|
* 2 items for dir items
|
2011-02-18 09:21:17 +00:00
|
|
|
* 1 item for parent inode
|
2018-05-11 13:13:40 -07:00
|
|
|
* 1 item for orphan item deletion if O_TMPFILE
|
2010-05-16 10:48:46 -04:00
|
|
|
*/
|
2018-05-11 13:13:40 -07:00
|
|
|
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
|
2010-05-16 10:48:46 -04:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
err = PTR_ERR(trans);
|
2016-01-05 16:24:05 +00:00
|
|
|
trans = NULL;
|
2010-05-16 10:48:46 -04:00
|
|
|
goto fail;
|
|
|
|
|
}
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
/* There are several dir indexes for this inode, clear the cache. */
|
|
|
|
|
BTRFS_I(inode)->dir_index = 0ULL;
|
2013-10-16 12:10:34 -07:00
|
|
|
inc_nlink(inode);
|
2012-04-05 15:03:02 -04:00
|
|
|
inode_inc_iversion(inode);
|
2016-09-14 07:48:06 -07:00
|
|
|
inode->i_ctime = current_time(inode);
|
2010-10-23 11:11:40 -04:00
|
|
|
ihold(inode);
|
2012-10-11 15:53:56 -04:00
|
|
|
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
|
2008-07-24 12:12:38 -04:00
|
|
|
|
2022-03-09 17:31:34 -08:00
|
|
|
err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
|
|
|
|
|
dentry->d_name.name, dentry->d_name.len, 1, index);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2009-09-24 09:17:31 -04:00
|
|
|
if (err) {
|
2007-06-22 14:16:25 -04:00
|
|
|
drop_inode = 1;
|
2009-09-24 09:17:31 -04:00
|
|
|
} else {
|
2011-07-16 23:09:10 -04:00
|
|
|
struct dentry *parent = dentry->d_parent;
|
Btrfs: sync log after logging new name
When we add a new name for an inode which was logged in the current
transaction, we update the inode in the log so that its new name and
ancestors are added to the log. However when we do this we do not persist
the log, so the changes remain in memory only, and as a consequence, any
ancestors that were created in the current transaction are updated such
that future calls to btrfs_inode_in_log() return true. This leads to a
subsequent fsync against such new ancestor directories returning
immediately, without persisting the log, therefore after a power failure
the new ancestor directories do not exist, despite fsync being called
against them explicitly.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/A
$ mkdir /mnt/B
$ mkdir /mnt/A/C
$ touch /mnt/B/foo
$ xfs_io -c "fsync" /mnt/B/foo
$ ln /mnt/B/foo /mnt/A/C/foo
$ xfs_io -c "fsync" /mnt/A
<power failure>
After the power failure, directory "A" does not exist, despite the explicit
fsync on it.
Instead of fixing this by changing the behaviour of the explicit fsync on
directory "A" to persist the log instead of doing nothing, make the logging
of the new file name (which happens when creating a hard link or renaming)
persist the log. This approach not only is simpler, not requiring addition
of new fields to the inode in memory structure, but also gives us the same
behaviour as ext4, xfs and f2fs (possibly other filesystems too).
A test case for fstests follows soon.
Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
Reported-by: Vijay Chidambaram <vvijay03@gmail.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-06-11 19:24:28 +01:00
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
err = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2012-03-12 16:03:00 +01:00
|
|
|
if (err)
|
|
|
|
|
goto fail;
|
2014-04-27 20:40:45 +01:00
|
|
|
if (inode->i_nlink == 1) {
|
|
|
|
|
/*
|
|
|
|
|
* If new hard link count is 1, it's a file created
|
|
|
|
|
* with open(2) O_TMPFILE flag.
|
|
|
|
|
*/
|
2017-02-20 13:50:58 +02:00
|
|
|
err = btrfs_orphan_del(trans, BTRFS_I(inode));
|
2014-04-27 20:40:45 +01:00
|
|
|
if (err)
|
|
|
|
|
goto fail;
|
|
|
|
|
}
|
2011-12-23 07:58:13 -05:00
|
|
|
d_instantiate(dentry, inode);
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work then necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
|
2009-09-24 09:17:31 -04:00
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2007-12-21 16:27:21 -05:00
|
|
|
fail:
|
2016-01-05 16:24:05 +00:00
|
|
|
if (trans)
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (drop_inode) {
|
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
|
iput(inode);
|
|
|
|
|
}
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2007-06-12 06:35:45 -04:00
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
/*
 * Create a directory named by @dentry inside directory @dir.
 *
 * A new VFS inode is allocated on @dir's superblock, its ownership and mode
 * (with S_IFDIR forced on) are derived from @dir and @mode via
 * inode_init_owner(), and the directory operation tables are installed
 * before the inode is handed to btrfs_create_common().
 *
 * Returns -ENOMEM if the inode cannot be allocated, otherwise the result of
 * btrfs_create_common().
 */
static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, umode_t mode)
{
	struct inode *new = new_inode(dir->i_sb);

	if (!new)
		return -ENOMEM;

	inode_init_owner(mnt_userns, new, dir, S_IFDIR | mode);

	new->i_fop = &btrfs_dir_file_operations;
	new->i_op = &btrfs_dir_inode_operations;

	return btrfs_create_common(dir, dentry, new);
}
|
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
static noinline int uncompress_inline(struct btrfs_path *path,
|
2015-05-19 23:46:45 +09:00
|
|
|
struct page *page,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
size_t pg_offset, u64 extent_offset,
|
|
|
|
|
struct btrfs_file_extent_item *item)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
|
char *tmp;
|
|
|
|
|
size_t max_size;
|
|
|
|
|
unsigned long inline_size;
|
|
|
|
|
unsigned long ptr;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
|
|
|
|
|
WARN_ON(pg_offset != 0);
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type = btrfs_file_extent_compression(leaf, item);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
max_size = btrfs_file_extent_ram_bytes(leaf, item);
|
2021-10-21 14:58:33 -04:00
|
|
|
inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
tmp = kmalloc(inline_size, GFP_NOFS);
|
2011-04-25 19:43:52 -04:00
|
|
|
if (!tmp)
|
|
|
|
|
return -ENOMEM;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
ptr = btrfs_file_extent_inline_start(item);
|
|
|
|
|
|
|
|
|
|
read_extent_buffer(leaf, tmp, ptr, inline_size);
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
|
2010-12-17 14:21:50 +08:00
|
|
|
ret = btrfs_decompress(compress_type, tmp, page,
|
|
|
|
|
extent_offset, inline_size, max_size);
|
btrfs: add missing memset while reading compressed inline extents
This is a story about 4 distinct (and very old) btrfs bugs.
Commit c8b978188c ("Btrfs: Add zlib compression support") added
three data corruption bugs for inline extents (bugs #1-3).
Commit 93c82d5750 ("Btrfs: zero page past end of inline file items")
fixed bug #1: uncompressed inline extents followed by a hole and more
extents could get non-zero data in the hole as they were read. The fix
was to add a memset in btrfs_get_extent to zero out the hole.
Commit 166ae5a418 ("btrfs: fix inline compressed read err corruption")
fixed bug #2: compressed inline extents which contained non-zero bytes
might be replaced with zero bytes in some cases. This patch removed an
unhelpful memset from uncompress_inline, but the case where memset is
required was missed.
There is also a memset in the decompression code, but this only covers
decompressed data that is shorter than the ram_bytes from the extent
ref record. This memset doesn't cover the region between the end of the
decompressed data and the end of the page. It has also moved around a
few times over the years, so there's no single patch to refer to.
This patch fixes bug #3: compressed inline extents followed by a hole
and more extents could get non-zero data in the hole as they were read
(i.e. bug #3 is the same as bug #1, but s/uncompressed/compressed/).
The fix is the same: zero out the hole in the compressed case too,
by putting a memset back in uncompress_inline, but this time with
correct parameters.
The last and oldest bug, bug #0, is the cause of the offending inline
extent/hole/extent pattern. Bug #0 is a subtle and mostly-harmless quirk
of behavior somewhere in the btrfs write code. In a few special cases,
an inline extent and hole are allowed to persist where they normally
would be combined with later extents in the file.
A fast reproducer for bug #0 is presented below. A few offending extents
are also created in the wild during large rsync transfers with the -S
flag. A Linux kernel build (git checkout; make allyesconfig; make -j8)
will produce a handful of offending files as well. Once an offending
file is created, it can present different content to userspace each
time it is read.
Bug #0 is at least 4 and possibly 8 years old. I verified every vX.Y
kernel back to v3.5 has this behavior. There are fossil records of this
bug's effects in commits all the way back to v2.6.32. I have no reason
to believe bug #0 wasn't present at the beginning of btrfs compression
support in v2.6.29, but I can't easily test kernels that old to be sure.
It is not clear whether bug #0 is worth fixing. A fix would likely
require injecting extra reads into currently write-only paths, and most
of the exceptional cases caused by bug #0 are already handled now.
Whether we like them or not, bug #0's inline extents followed by holes
are part of the btrfs de-facto disk format now, and we need to be able
to read them without data corruption or an infoleak. So enough about
bug #0, let's get back to bug #3 (this patch).
An example of on-disk structure leading to data corruption found in
the wild:
item 61 key (606890 INODE_ITEM 0) itemoff 9662 itemsize 160
inode generation 50 transid 50 size 47424 nbytes 49141
block group 0 mode 100644 links 1 uid 0 gid 0
rdev 0 flags 0x0(none)
item 62 key (606890 INODE_REF 603050) itemoff 9642 itemsize 20
inode ref index 3 namelen 10 name: DB_File.so
item 63 key (606890 EXTENT_DATA 0) itemoff 8280 itemsize 1362
inline extent data size 1341 ram 4085 compress(zlib)
item 64 key (606890 EXTENT_DATA 4096) itemoff 8227 itemsize 53
extent data disk byte 5367308288 nr 20480
extent data offset 0 nr 45056 ram 45056
extent compression(zlib)
Different data appears in userspace during each read of the 11 bytes
between 4085 and 4096. The extent in item 63 is not long enough to
fill the first page of the file, so a memset is required to fill the
space between item 63 (ending at 4085) and item 64 (beginning at 4096)
with zero.
Here is a reproducer from Liu Bo, which demonstrates another method
of creating the same inline extent and hole pattern:
Using 'page_poison=on' kernel command line (or enable
CONFIG_PAGE_POISONING) run the following:
# touch foo
# chattr +c foo
# xfs_io -f -c "pwrite -W 0 1000" foo
# xfs_io -f -c "falloc 4 8188" foo
# od -x foo
# echo 3 >/proc/sys/vm/drop_caches
# od -x foo
This produces the following on my box:
Correct output: file contains 1000 data bytes followed
by zeros:
0000000 cdcd cdcd cdcd cdcd cdcd cdcd cdcd cdcd
*
0001740 cdcd cdcd cdcd cdcd 0000 0000 0000 0000
0001760 0000 0000 0000 0000 0000 0000 0000 0000
*
0020000
Actual output: the data after the first 1000 bytes
will be different each run:
0000000 cdcd cdcd cdcd cdcd cdcd cdcd cdcd cdcd
*
0001740 cdcd cdcd cdcd cdcd 6c63 7400 635f 006d
0001760 5f74 6f43 7400 435f 0053 5f74 7363 7400
0002000 435f 0056 5f74 6164 7400 645f 0062 5f74
(...)
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Chris Mason <clm@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2017-03-10 16:45:44 -05:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* decompression code contains a memset to fill in any space between the end
|
|
|
|
|
* of the uncompressed data and the end of max_size in case the decompressed
|
|
|
|
|
* data ends up shorter than ram_bytes. That doesn't cover the hole between
|
|
|
|
|
* the end of an inline extent and the beginning of the next block, so we
|
|
|
|
|
* cover that region here.
|
|
|
|
|
*/
|
|
|
|
|
|
btrfs: use memzero_page() instead of open coded kmap pattern
There are many places where kmap/memset/kunmap patterns occur.
Use the newly lifted memzero_page() to eliminate direct uses of kmap and
leverage the new core functions use of kmap_local_page().
The development of this patch was aided by the following coccinelle
script:
// <smpl>
// SPDX-License-Identifier: GPL-2.0-only
// Find kmap/memset/kunmap pattern and replace with memset*page calls
//
// NOTE: Offsets and other expressions may be more complex than what the script
// will automatically generate. Therefore a catchall rule is provided to find
// the pattern which then must be evaluated by hand.
//
// Confidence: Low
// Copyright: (C) 2021 Intel Corporation
// URL: http://coccinelle.lip6.fr/
// Comments:
// Options:
//
// Then the memset pattern
//
@ memset_rule1 @
expression page, V, L, Off;
identifier ptr;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
-memset(ptr, 0, L);
+memzero_page(page, 0, L);
|
-memset(ptr + Off, 0, L);
+memzero_page(page, Off, L);
|
-memset(ptr, V, L);
+memset_page(page, V, 0, L);
|
-memset(ptr + Off, V, L);
+memset_page(page, V, Off, L);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule1
@
identifier memset_rule1.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
//
// Catch all
//
@ memset_rule2 @
expression page;
identifier ptr;
expression GenTo, GenSize, GenValue;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
//
// Some call sites have complex expressions within the memset/memcpy
// The following are catch-alls which need to be evaluated by hand.
//
-memset(GenTo, 0, GenSize);
+memzero_pageExtra(page, GenTo, GenSize);
|
-memset(GenTo, GenValue, GenSize);
+memset_pageExtra(page, GenValue, GenTo, GenSize);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule2
@
identifier memset_rule2.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
// </smpl>
Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-04 18:40:07 -07:00
|
|
|
if (max_size + pg_offset < PAGE_SIZE)
|
|
|
|
|
memzero_page(page, pg_offset + max_size,
|
|
|
|
|
PAGE_SIZE - max_size - pg_offset);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
kfree(tmp);
|
2014-05-09 17:15:10 -04:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
|
|
|
|
|
2019-12-02 17:34:23 -08:00
|
|
|
/**
|
|
|
|
|
* btrfs_get_extent - Lookup the first extent overlapping a range in a file.
|
|
|
|
|
* @inode: file to search in
|
|
|
|
|
* @page: page to read extent data into if the extent is inline
|
|
|
|
|
* @pg_offset: offset into @page to copy to
|
|
|
|
|
* @start: file offset
|
|
|
|
|
* @len: length of range starting at @start
|
|
|
|
|
*
|
|
|
|
|
* This returns the first &struct extent_map which overlaps with the given
|
|
|
|
|
* range, reading it from the B-tree and caching it if necessary. Note that
|
|
|
|
|
* there may be more extents which overlap the given range after the returned
|
|
|
|
|
* extent_map.
|
2008-09-29 15:18:18 -04:00
|
|
|
*
|
2019-12-02 17:34:23 -08:00
|
|
|
* If @page is not NULL and the extent is inline, this also reads the extent
|
|
|
|
|
* data directly into the page and marks the extent up to date in the io_tree.
|
|
|
|
|
*
|
|
|
|
|
* Return: ERR_PTR on error, non-NULL extent_map on success.
|
2008-09-29 15:18:18 -04:00
|
|
|
*/
|
2017-02-20 13:51:06 +02:00
|
|
|
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
|
2019-12-02 17:34:23 -08:00
|
|
|
struct page *page, size_t pg_offset,
|
|
|
|
|
u64 start, u64 len)
|
2007-08-27 16:49:44 -04:00
|
|
|
{
|
2018-06-29 10:56:42 +02:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
2020-08-03 12:58:46 +03:00
|
|
|
int ret = 0;
|
2007-08-27 16:49:44 -04:00
|
|
|
u64 extent_start = 0;
|
|
|
|
|
u64 extent_end = 0;
|
2017-02-20 13:51:06 +02:00
|
|
|
u64 objectid = btrfs_ino(inode);
|
2019-05-01 12:19:20 -07:00
|
|
|
int extent_type = -1;
|
2008-07-22 11:18:09 -04:00
|
|
|
struct btrfs_path *path = NULL;
|
2017-02-20 13:51:06 +02:00
|
|
|
struct btrfs_root *root = inode->root;
|
2007-08-27 16:49:44 -04:00
|
|
|
struct btrfs_file_extent_item *item;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct btrfs_key found_key;
|
2007-08-27 16:49:44 -04:00
|
|
|
struct extent_map *em = NULL;
|
2017-02-20 13:51:06 +02:00
|
|
|
struct extent_map_tree *em_tree = &inode->extent_tree;
|
2007-08-27 16:49:44 -04:00
|
|
|
|
2009-09-02 16:24:52 -04:00
|
|
|
read_lock(&em_tree->lock);
|
2008-01-24 16:13:08 -05:00
|
|
|
em = lookup_extent_mapping(em_tree, start, len);
|
2009-09-02 16:24:52 -04:00
|
|
|
read_unlock(&em_tree->lock);
|
2008-01-24 16:13:08 -05:00
|
|
|
|
2007-08-27 16:49:44 -04:00
|
|
|
if (em) {
|
2008-04-22 13:26:46 -04:00
|
|
|
if (em->start > start || em->start + em->len <= start)
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
else if (em->block_start == EXTENT_MAP_INLINE && page)
|
2008-01-29 09:59:12 -05:00
|
|
|
free_extent_map(em);
|
|
|
|
|
else
|
|
|
|
|
goto out;
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
2011-04-21 00:48:27 +02:00
|
|
|
em = alloc_extent_map();
|
2007-08-27 16:49:44 -04:00
|
|
|
if (!em) {
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = -ENOMEM;
|
2008-01-24 16:13:08 -05:00
|
|
|
goto out;
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
2008-01-24 16:13:08 -05:00
|
|
|
em->start = EXTENT_MAP_HOLE;
|
2008-11-10 11:53:33 -05:00
|
|
|
em->orig_start = EXTENT_MAP_HOLE;
|
2008-01-24 16:13:08 -05:00
|
|
|
em->len = (u64)-1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
em->block_len = (u64)-1;
|
2008-07-22 11:18:09 -04:00
|
|
|
|
2018-08-17 05:05:28 +08:00
|
|
|
path = btrfs_alloc_path();
|
2008-07-22 11:18:09 -04:00
|
|
|
if (!path) {
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = -ENOMEM;
|
2018-08-17 05:05:28 +08:00
|
|
|
goto out;
|
2008-07-22 11:18:09 -04:00
|
|
|
}
|
|
|
|
|
|
2018-08-17 05:05:28 +08:00
|
|
|
/* Chances are we'll be called again, so go ahead and do readahead */
|
|
|
|
|
path->reada = READA_FORWARD;
|
2020-10-23 09:58:09 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The same explanation in load_free_space_cache applies here as well,
|
|
|
|
|
* we only read when we're loading the free space cache, and at that
|
|
|
|
|
* point the commit_root has everything we need.
|
|
|
|
|
*/
|
|
|
|
|
if (btrfs_is_free_space_inode(inode)) {
|
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
|
path->skip_locking = 1;
|
|
|
|
|
}
|
2020-08-20 11:46:01 -04:00
|
|
|
|
2017-12-01 11:19:40 +02:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
|
2007-08-27 16:49:44 -04:00
|
|
|
if (ret < 0) {
|
|
|
|
|
goto out;
|
2018-12-17 11:49:00 +02:00
|
|
|
} else if (ret > 0) {
|
2007-08-27 16:49:44 -04:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
|
goto not_found;
|
|
|
|
|
path->slots[0]--;
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = 0;
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
|
|
|
|
|
2007-10-15 16:14:19 -04:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0],
|
2007-08-27 16:49:44 -04:00
|
|
|
struct btrfs_file_extent_item);
|
2007-10-15 16:14:19 -04:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
if (found_key.objectid != objectid ||
|
2018-12-17 10:35:59 +02:00
|
|
|
found_key.type != BTRFS_EXTENT_DATA_KEY) {
|
2013-10-14 12:08:38 -04:00
|
|
|
/*
|
|
|
|
|
* If we backup past the first extent we want to move forward
|
|
|
|
|
* and see if there is an extent in front of us, otherwise we'll
|
|
|
|
|
* say there is a hole for our whole search range which can
|
|
|
|
|
* cause problems.
|
|
|
|
|
*/
|
|
|
|
|
extent_end = start;
|
|
|
|
|
goto next;
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
|
|
|
|
|
2018-12-17 10:35:59 +02:00
|
|
|
extent_type = btrfs_file_extent_type(leaf, item);
|
2007-10-15 16:14:19 -04:00
|
|
|
extent_start = found_key.offset;
|
2020-03-09 12:41:06 +00:00
|
|
|
extent_end = btrfs_file_extent_end(path);
|
2018-12-17 10:35:59 +02:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
|
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2019-03-13 13:55:11 +08:00
|
|
|
/* Only regular file could have regular/prealloc extent */
|
|
|
|
|
if (!S_ISREG(inode->vfs_inode.i_mode)) {
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = -EUCLEAN;
|
2019-03-13 13:55:11 +08:00
|
|
|
btrfs_crit(fs_info,
|
|
|
|
|
"regular/prealloc extent found for non-regular inode %llu",
|
|
|
|
|
btrfs_ino(inode));
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2017-03-10 11:09:48 -08:00
|
|
|
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
|
|
|
|
|
extent_start);
|
2018-12-17 10:35:59 +02:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
2017-03-10 11:09:48 -08:00
|
|
|
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
|
|
|
|
|
path->slots[0],
|
|
|
|
|
extent_start);
|
2008-10-30 14:19:41 -04:00
|
|
|
}
|
2013-10-14 12:08:38 -04:00
|
|
|
next:
|
2008-10-30 14:19:41 -04:00
|
|
|
if (start >= extent_end) {
|
|
|
|
|
path->slots[0]++;
|
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
2020-08-03 12:58:46 +03:00
|
|
|
if (ret < 0)
|
2008-10-30 14:19:41 -04:00
|
|
|
goto out;
|
2020-08-03 12:58:46 +03:00
|
|
|
else if (ret > 0)
|
2008-10-30 14:19:41 -04:00
|
|
|
goto not_found;
|
2020-08-03 12:58:46 +03:00
|
|
|
|
2008-10-30 14:19:41 -04:00
|
|
|
leaf = path->nodes[0];
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
2008-10-30 14:19:41 -04:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
if (found_key.objectid != objectid ||
|
|
|
|
|
found_key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
|
goto not_found;
|
|
|
|
|
if (start + len <= found_key.offset)
|
|
|
|
|
goto not_found;
|
2014-07-17 11:44:14 +08:00
|
|
|
if (start > found_key.offset)
|
|
|
|
|
goto next;
|
2018-12-17 10:36:02 +02:00
|
|
|
|
|
|
|
|
/* New extent overlaps with existing one */
|
2008-10-30 14:19:41 -04:00
|
|
|
em->start = start;
|
2012-10-11 16:54:30 -04:00
|
|
|
em->orig_start = start;
|
2008-10-30 14:19:41 -04:00
|
|
|
em->len = found_key.offset - start;
|
2018-12-17 10:36:02 +02:00
|
|
|
em->block_start = EXTENT_MAP_HOLE;
|
|
|
|
|
goto insert;
|
2008-10-30 14:19:41 -04:00
|
|
|
}
|
|
|
|
|
|
2019-12-02 17:34:23 -08:00
|
|
|
btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
|
2014-06-09 03:48:05 +01:00
|
|
|
|
2018-12-17 10:35:59 +02:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
|
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2007-08-27 16:49:44 -04:00
|
|
|
goto insert;
|
2018-12-17 10:35:59 +02:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-15 16:14:19 -04:00
|
|
|
unsigned long ptr;
|
2007-08-27 16:49:44 -04:00
|
|
|
char *map;
|
2007-10-15 16:18:25 -04:00
|
|
|
size_t size;
|
|
|
|
|
size_t extent_offset;
|
|
|
|
|
size_t copy_size;
|
2007-08-27 16:49:44 -04:00
|
|
|
|
2019-12-02 17:34:23 -08:00
|
|
|
if (!page)
|
2007-10-29 11:41:07 -04:00
|
|
|
goto out;
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2018-06-06 15:41:49 +08:00
|
|
|
size = btrfs_file_extent_ram_bytes(leaf, item);
|
2008-10-30 14:19:41 -04:00
|
|
|
extent_offset = page_offset(page) + pg_offset - extent_start;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion whether a
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
copy_size = min_t(u64, PAGE_SIZE - pg_offset,
|
|
|
|
|
size - extent_offset);
|
2007-10-15 16:18:25 -04:00
|
|
|
em->start = extent_start + extent_offset;
|
2016-06-22 18:54:23 -04:00
|
|
|
em->len = ALIGN(copy_size, fs_info->sectorsize);
|
2012-12-03 10:31:19 -05:00
|
|
|
em->orig_block_len = em->len;
|
2012-10-11 16:54:30 -04:00
|
|
|
em->orig_start = em->start;
|
2007-10-29 11:41:07 -04:00
|
|
|
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
|
2018-08-25 13:47:09 +08:00
|
|
|
|
2017-11-20 13:24:49 -07:00
|
|
|
if (!PageUptodate(page)) {
|
2010-12-17 14:21:50 +08:00
|
|
|
if (btrfs_file_extent_compression(leaf, item) !=
|
|
|
|
|
BTRFS_COMPRESS_NONE) {
|
2015-05-19 23:46:45 +09:00
|
|
|
ret = uncompress_inline(path, page, pg_offset,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
extent_offset, item);
|
2020-08-03 12:58:46 +03:00
|
|
|
if (ret)
|
2014-05-09 17:15:10 -04:00
|
|
|
goto out;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
} else {
|
2021-02-16 18:48:23 -08:00
|
|
|
map = kmap_local_page(page);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
read_extent_buffer(leaf, map + pg_offset, ptr,
|
|
|
|
|
copy_size);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
if (pg_offset + copy_size < PAGE_SIZE) {
|
2009-09-11 12:36:29 -04:00
|
|
|
memset(map + pg_offset + copy_size, 0,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
PAGE_SIZE - pg_offset -
|
2009-09-11 12:36:29 -04:00
|
|
|
copy_size);
|
|
|
|
|
}
|
2021-02-16 18:48:23 -08:00
|
|
|
kunmap_local(map);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
}
|
2007-11-01 11:28:41 -04:00
|
|
|
flush_dcache_page(page);
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
|
|
|
|
goto insert;
|
|
|
|
|
}
|
|
|
|
|
not_found:
|
|
|
|
|
em->start = start;
|
2012-10-11 16:54:30 -04:00
|
|
|
em->orig_start = start;
|
2008-01-24 16:13:08 -05:00
|
|
|
em->len = len;
|
2007-10-15 16:14:19 -04:00
|
|
|
em->block_start = EXTENT_MAP_HOLE;
|
2007-08-27 16:49:44 -04:00
|
|
|
insert:
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = 0;
|
2011-04-21 01:20:15 +02:00
|
|
|
btrfs_release_path(path);
|
2008-01-24 16:13:08 -05:00
|
|
|
if (em->start > start || extent_map_end(em) <= start) {
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 10:05:00 -04:00
|
|
|
"bad extent! em: [%llu %llu] passed [%llu %llu]",
|
|
|
|
|
em->start, em->len, start, len);
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = -EIO;
|
2007-08-27 16:49:44 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
2008-01-24 16:13:08 -05:00
|
|
|
|
2009-09-02 16:24:52 -04:00
|
|
|
write_lock(&em_tree->lock);
|
2020-08-03 12:58:46 +03:00
|
|
|
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
|
2009-09-02 16:24:52 -04:00
|
|
|
write_unlock(&em_tree->lock);
|
2007-08-27 16:49:44 -04:00
|
|
|
out:
|
2018-08-23 07:36:17 +08:00
|
|
|
btrfs_free_path(path);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how a page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
A chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, so these are helpful for tracing space usage.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2017-02-20 13:51:06 +02:00
|
|
|
trace_btrfs_get_extent(root, inode, em);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how a page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
A chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, so these are helpful for tracing space usage.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2020-08-03 12:58:46 +03:00
|
|
|
if (ret) {
|
2007-08-27 16:49:44 -04:00
|
|
|
free_extent_map(em);
|
2020-08-03 12:58:46 +03:00
|
|
|
return ERR_PTR(ret);
|
2007-08-27 16:49:44 -04:00
|
|
|
}
|
|
|
|
|
return em;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:31 +03:00
|
|
|
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
|
2016-05-12 13:53:36 +01:00
|
|
|
const u64 start,
|
|
|
|
|
const u64 len,
|
|
|
|
|
const u64 orig_start,
|
|
|
|
|
const u64 block_start,
|
|
|
|
|
const u64 block_len,
|
|
|
|
|
const u64 orig_block_len,
|
|
|
|
|
const u64 ram_bytes,
|
|
|
|
|
const int type)
|
|
|
|
|
{
|
|
|
|
|
struct extent_map *em = NULL;
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
if (type != BTRFS_ORDERED_NOCOW) {
|
2020-06-03 08:55:31 +03:00
|
|
|
em = create_io_em(inode, start, len, orig_start, block_start,
|
|
|
|
|
block_len, orig_block_len, ram_bytes,
|
2017-01-31 07:50:22 -08:00
|
|
|
BTRFS_COMPRESS_NONE, /* compress_type */
|
|
|
|
|
type);
|
2016-05-12 13:53:36 +01:00
|
|
|
if (IS_ERR(em))
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2019-11-06 12:11:56 -08:00
|
|
|
ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
|
|
|
|
|
block_len, 0,
|
|
|
|
|
(1 << type) |
|
|
|
|
|
(1 << BTRFS_ORDERED_DIRECT),
|
|
|
|
|
BTRFS_COMPRESS_NONE);
|
2016-05-12 13:53:36 +01:00
|
|
|
if (ret) {
|
|
|
|
|
if (em) {
|
|
|
|
|
free_extent_map(em);
|
2020-06-03 08:55:31 +03:00
|
|
|
btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
|
2016-05-12 13:53:36 +01:00
|
|
|
}
|
|
|
|
|
em = ERR_PTR(ret);
|
|
|
|
|
}
|
|
|
|
|
out:
|
|
|
|
|
|
|
|
|
|
return em;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-03 08:55:32 +03:00
|
|
|
/*
 * Reserve a new extent for the file range beginning at @start and create
 * the direct IO extent map / ordered extent for it.
 *
 * The reservation result is returned in a btrfs_key: its objectid is used
 * as the on-disk start of the extent and its offset as the extent length.
 *
 * Return: the extent map from btrfs_create_dio_extent(), or an ERR_PTR()
 * if the reservation or the dio extent creation failed. In the latter case
 * the reserved extent is given back via btrfs_free_reserved_extent().
 */
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_map *map;
	struct btrfs_key ins;
	u64 disk_bytenr;
	u64 num_bytes;
	u64 hint;
	int err;

	hint = get_extent_allocation_hint(inode, start, len);
	err = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, hint, &ins, 1, 1);
	if (err)
		return ERR_PTR(err);

	disk_bytenr = ins.objectid;
	num_bytes = ins.offset;

	map = btrfs_create_dio_extent(inode, start, num_bytes, start,
				      disk_bytenr, num_bytes, num_bytes,
				      num_bytes, BTRFS_ORDERED_REGULAR);

	/* Done on both the success and the failure path. */
	btrfs_dec_block_group_reservations(fs_info, disk_bytenr);

	if (IS_ERR(map))
		btrfs_free_reserved_extent(fs_info, disk_bytenr, num_bytes, 1);

	return map;
}
|
|
|
|
|
|
2021-02-10 21:25:16 -08:00
|
|
|
/*
 * Tell whether the extent at @bytenr must be treated as read-only.
 *
 * Return: true if no block group is found for @bytenr or if the block group
 * found has its ->ro field set, false otherwise.
 */
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool is_ro = false;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	/* A failed lookup is reported as read-only. */
	if (!bg)
		return true;

	if (bg->ro)
		is_ro = true;

	/* Release the block group obtained from the lookup above. */
	btrfs_put_block_group(bg);

	return is_ro;
}
|
|
|
|
|
|
2010-05-26 11:04:10 -04:00
|
|
|
/*
|
2020-06-24 07:23:51 +08:00
|
|
|
* Check if we can do nocow write into the range [@offset, @offset + @len)
|
|
|
|
|
*
|
|
|
|
|
* @offset: File offset
|
|
|
|
|
* @len: The length to write, will be updated to the nocow writeable
|
|
|
|
|
* range
|
|
|
|
|
* @orig_start: (optional) Return the original file offset of the file extent
|
|
|
|
|
* @orig_len: (optional) Return the original on-disk length of the file extent
|
|
|
|
|
* @ram_bytes: (optional) Return the ram_bytes of the file extent
|
2020-08-18 11:00:05 -07:00
|
|
|
* @strict: if true, omit optimizations that might force us into unnecessary
|
|
|
|
|
* cow. e.g., don't trust generation number.
|
2020-06-24 07:23:51 +08:00
|
|
|
*
|
|
|
|
|
* Return:
|
|
|
|
|
* >0 and update @len if we can do nocow write
|
|
|
|
|
* 0 if we can't do nocow write
|
|
|
|
|
* <0 if error happened
|
|
|
|
|
*
|
|
|
|
|
* NOTE: This only checks the file extents, caller is responsible to wait for
|
|
|
|
|
* any ordered extents.
|
2010-05-26 11:04:10 -04:00
|
|
|
*/
|
2013-08-14 14:02:47 -04:00
|
|
|
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
2013-06-21 16:37:03 -04:00
|
|
|
u64 *orig_start, u64 *orig_block_len,
|
2020-08-18 11:00:05 -07:00
|
|
|
u64 *ram_bytes, bool strict)
|
2010-05-26 11:04:10 -04:00
|
|
|
{
|
2016-06-22 18:54:24 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2022-03-30 15:31:06 +01:00
|
|
|
struct can_nocow_file_extent_args nocow_args = { 0 };
|
2010-05-26 11:04:10 -04:00
|
|
|
struct btrfs_path *path;
|
|
|
|
|
int ret;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2014-02-27 13:58:05 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-05-26 11:04:10 -04:00
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
int found_type;
|
2013-12-27 21:11:50 +08:00
|
|
|
|
2010-05-26 11:04:10 -04:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
2017-01-20 14:54:07 +01:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path,
|
|
|
|
|
btrfs_ino(BTRFS_I(inode)), offset, 0);
|
2010-05-26 11:04:10 -04:00
|
|
|
if (ret < 0)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
if (ret == 1) {
|
2022-03-30 15:31:06 +01:00
|
|
|
if (path->slots[0] == 0) {
|
2010-05-26 11:04:10 -04:00
|
|
|
/* can't find the item, must cow */
|
|
|
|
|
ret = 0;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2022-03-30 15:31:06 +01:00
|
|
|
path->slots[0]--;
|
2010-05-26 11:04:10 -04:00
|
|
|
}
|
|
|
|
|
ret = 0;
|
|
|
|
|
leaf = path->nodes[0];
|
2022-03-30 15:31:06 +01:00
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
2017-01-10 20:35:31 +02:00
|
|
|
if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
|
2010-05-26 11:04:10 -04:00
|
|
|
key.type != BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
|
/* not our file or wrong item type, must cow */
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (key.offset > offset) {
|
|
|
|
|
/* Wrong offset, must cow */
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
if (btrfs_file_extent_end(path) <= offset)
|
2013-06-21 16:37:03 -04:00
|
|
|
goto out;
|
|
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
|
|
|
|
|
found_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
|
if (ram_bytes)
|
|
|
|
|
*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
|
2013-12-27 21:11:50 +08:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
nocow_args.start = offset;
|
|
|
|
|
nocow_args.end = offset + *len - 1;
|
|
|
|
|
nocow_args.strict = strict;
|
|
|
|
|
nocow_args.free_path = true;
|
2013-06-21 16:37:03 -04:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
|
|
|
|
|
/* can_nocow_file_extent() has freed the path. */
|
|
|
|
|
path = NULL;
|
2013-06-21 16:37:03 -04:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
if (ret != 1) {
|
|
|
|
|
/* Treat errors as not being able to NOCOW. */
|
|
|
|
|
ret = 0;
|
2018-05-17 14:58:29 +08:00
|
|
|
goto out;
|
2013-06-21 16:37:03 -04:00
|
|
|
}
|
2013-04-24 16:32:55 -04:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
ret = 0;
|
|
|
|
|
if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
|
2010-05-26 11:04:10 -04:00
|
|
|
goto out;
|
2014-02-27 13:58:05 +08:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
|
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2014-02-27 13:58:05 +08:00
|
|
|
u64 range_end;
|
|
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
range_end = round_up(offset + nocow_args.num_bytes,
|
2016-06-15 09:22:56 -04:00
|
|
|
root->fs_info->sectorsize) - 1;
|
2014-02-27 13:58:05 +08:00
|
|
|
ret = test_range_bit(io_tree, offset, range_end,
|
|
|
|
|
EXTENT_DELALLOC, 0, NULL);
|
|
|
|
|
if (ret) {
|
|
|
|
|
ret = -EAGAIN;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
if (orig_start)
|
|
|
|
|
*orig_start = key.offset - nocow_args.extent_offset;
|
|
|
|
|
if (orig_block_len)
|
|
|
|
|
*orig_block_len = nocow_args.disk_num_bytes;
|
2013-08-14 14:02:47 -04:00
|
|
|
|
2022-03-30 15:31:06 +01:00
|
|
|
*len = nocow_args.num_bytes;
|
2010-05-26 11:04:10 -04:00
|
|
|
ret = 1;
|
|
|
|
|
out:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2012-07-31 16:28:48 -04:00
|
|
|
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
|
2022-03-23 16:19:24 +00:00
|
|
|
struct extent_state **cached_state,
|
|
|
|
|
unsigned int iomap_flags)
|
2012-07-31 16:28:48 -04:00
|
|
|
{
|
2022-03-23 16:19:24 +00:00
|
|
|
const bool writing = (iomap_flags & IOMAP_WRITE);
|
|
|
|
|
const bool nowait = (iomap_flags & IOMAP_NOWAIT);
|
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2012-07-31 16:28:48 -04:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
while (1) {
|
2022-03-23 16:19:24 +00:00
|
|
|
if (nowait) {
|
|
|
|
|
if (!try_lock_extent(io_tree, lockstart, lockend))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
} else {
|
|
|
|
|
lock_extent_bits(io_tree, lockstart, lockend, cached_state);
|
|
|
|
|
}
|
2012-07-31 16:28:48 -04:00
|
|
|
/*
|
|
|
|
|
* We're concerned with the entire range that we're going to be
|
2016-05-19 21:18:45 -04:00
|
|
|
* doing DIO to, so we need to make sure there's no ordered
|
2012-07-31 16:28:48 -04:00
|
|
|
* extents in this range.
|
|
|
|
|
*/
|
2017-02-20 13:50:49 +02:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
|
2012-07-31 16:28:48 -04:00
|
|
|
lockend - lockstart + 1);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We need to make sure there are no buffered pages in this
|
|
|
|
|
* range either, we could have raced between the invalidate in
|
|
|
|
|
* generic_file_direct_write and locking the extent. The
|
|
|
|
|
* invalidate needs to happen so that reads after a write do not
|
|
|
|
|
* get stale data.
|
|
|
|
|
*/
|
2014-05-20 13:07:56 -07:00
|
|
|
if (!ordered &&
|
2018-03-07 15:33:22 +01:00
|
|
|
(!writing || !filemap_range_has_page(inode->i_mapping,
|
|
|
|
|
lockstart, lockend)))
|
2012-07-31 16:28:48 -04:00
|
|
|
break;
|
|
|
|
|
|
2022-03-23 16:19:24 +00:00
|
|
|
unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
|
2012-07-31 16:28:48 -04:00
|
|
|
|
|
|
|
|
if (ordered) {
|
2022-03-23 16:19:24 +00:00
|
|
|
if (nowait) {
|
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
ret = -EAGAIN;
|
|
|
|
|
break;
|
|
|
|
|
}
|
Btrfs: fix deadlock between direct IO reads and buffered writes
While running a test with a mix of buffered IO and direct IO against
the same files I hit a deadlock reported by the following trace:
[11642.140352] INFO: task kworker/u32:3:15282 blocked for more than 120 seconds.
[11642.142452] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.143982] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.146332] kworker/u32:3 D ffff880230ef7988 [11642.147737] systemd-journald[571]: Sent WATCHDOG=1 notification.
[11642.149771] 0 15282 2 0x00000000
[11642.151205] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs]
[11642.154074] ffff880230ef7988 0000000000000246 0000000000014ec0 ffff88023ec94ec0
[11642.156722] ffff880233fe8f80 ffff880230ef8000 ffff88023ec94ec0 7fffffffffffffff
[11642.159205] 0000000000000002 ffffffff8147b7f9 ffff880230ef79a0 ffffffff8147b541
[11642.161403] Call Trace:
[11642.162129] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.163396] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.164871] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.167020] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.167931] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.182320] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.183762] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.185308] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.186782] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.188217] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.189626] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.190803] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.192158] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.193379] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.194831] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.197068] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.199188] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.200723] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.202465] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.203836] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.205624] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.207057] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.208529] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.210375] [<ffffffffa0462613>] ? btrfs_scrubparity_helper+0x140/0x33a [btrfs]
[11642.212132] [<ffffffffa044f974>] btrfs_run_ordered_extent_work+0x25/0x34 [btrfs]
[11642.213837] [<ffffffffa046262f>] btrfs_scrubparity_helper+0x15c/0x33a [btrfs]
[11642.215457] [<ffffffffa046293b>] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
[11642.217095] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.218324] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.219466] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.220801] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.222032] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.223190] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.224394] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.226295] 2 locks held by kworker/u32:3/15282:
[11642.227273] #0: ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.229412] #1: ((&work->normal_work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.231414] INFO: task kworker/u32:8:15289 blocked for more than 120 seconds.
[11642.232872] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.234109] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.235776] kworker/u32:8 D ffff88020de5f848 0 15289 2 0x00000000
[11642.237412] Workqueue: writeback wb_workfn (flush-btrfs-481)
[11642.238670] ffff88020de5f848 0000000000000246 0000000000014ec0 ffff88023ed54ec0
[11642.240475] ffff88021b1ece40 ffff88020de60000 ffff88023ed54ec0 7fffffffffffffff
[11642.242154] 0000000000000002 ffffffff8147b7f9 ffff88020de5f860 ffffffff8147b541
[11642.243715] Call Trace:
[11642.244390] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.245432] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.246392] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.247479] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.248551] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.249968] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.251043] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.252202] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.253210] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.254307] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.256118] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.257131] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.258200] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.259168] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.260516] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.261841] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.263531] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.264747] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.266148] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.267264] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.268280] [<ffffffff81192a2b>] __writeback_single_inode+0xda/0x5ba
[11642.269407] [<ffffffff811939f0>] writeback_sb_inodes+0x27b/0x43d
[11642.270476] [<ffffffff81193c28>] __writeback_inodes_wb+0x76/0xae
[11642.271547] [<ffffffff81193ea6>] wb_writeback+0x19e/0x41c
[11642.272588] [<ffffffff81194821>] wb_workfn+0x201/0x341
[11642.273523] [<ffffffff81194821>] ? wb_workfn+0x201/0x341
[11642.274479] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.275497] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.276518] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.277520] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.278517] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.279371] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.280468] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.281607] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.282604] 3 locks held by kworker/u32:8/15289:
[11642.283423] #0: ("writeback"){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.285629] #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.287538] #2: (&type->s_umount_key#37){+++++.}, at: [<ffffffff81171217>] trylock_super+0x1b/0x4b
[11642.289423] INFO: task fdm-stress:26848 blocked for more than 120 seconds.
[11642.290547] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.291453] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.292864] fdm-stress D ffff88022c107c20 0 26848 26591 0x00000000
[11642.294118] ffff88022c107c20 000000038108affa 0000000000014ec0 ffff88023ed54ec0
[11642.295602] ffff88013ab1ca40 ffff88022c108000 ffff8800b2fc19d0 00000000000e0fff
[11642.297098] ffff8800b2fc19b0 ffff88022c107c88 ffff88022c107c38 ffffffff8147b541
[11642.298433] Call Trace:
[11642.298896] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.299738] [<ffffffffa045225d>] lock_extent_bits+0xfe/0x1a3 [btrfs]
[11642.300833] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.301943] [<ffffffffa0447516>] lock_and_cleanup_extent_if_need+0x68/0x18e [btrfs]
[11642.303270] [<ffffffffa04485ba>] __btrfs_buffered_write+0x238/0x4c1 [btrfs]
[11642.304552] [<ffffffffa044b50a>] ? btrfs_file_write_iter+0x17c/0x408 [btrfs]
[11642.305782] [<ffffffffa044b682>] btrfs_file_write_iter+0x2f4/0x408 [btrfs]
[11642.306878] [<ffffffff8116e298>] __vfs_write+0x7c/0xa5
[11642.307729] [<ffffffff8116e7d1>] vfs_write+0x9d/0xe8
[11642.308602] [<ffffffff8116efbb>] SyS_write+0x50/0x7e
[11642.309410] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.310403] 3 locks held by fdm-stress/26848:
[11642.311108] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.312578] #1: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.314170] #2: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa044b401>] btrfs_file_write_iter+0x73/0x408 [btrfs]
[11642.316796] INFO: task fdm-stress:26849 blocked for more than 120 seconds.
[11642.317842] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.318691] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.319959] fdm-stress D ffff8801964ffa68 0 26849 26591 0x00000000
[11642.321312] ffff8801964ffa68 00ff8801e9975f80 0000000000014ec0 ffff88023ed94ec0
[11642.322555] ffff8800b00b4840 ffff880196500000 ffff8801e9975f20 0000000000000002
[11642.323715] ffff8801e9975f18 ffff8800b00b4840 ffff8801964ffa80 ffffffff8147b541
[11642.325096] Call Trace:
[11642.325532] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.326303] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.327180] [<ffffffff8108ae40>] ? mark_held_locks+0x5e/0x74
[11642.328114] [<ffffffff8147f30e>] ? _raw_spin_unlock_irq+0x2c/0x4a
[11642.329051] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.330053] [<ffffffff8147bceb>] __wait_for_common+0x109/0x147
[11642.330952] [<ffffffff8147bceb>] ? __wait_for_common+0x109/0x147
[11642.331869] [<ffffffff8147e7bb>] ? usleep_range+0x4a/0x4a
[11642.332925] [<ffffffff81074075>] ? wake_up_q+0x47/0x47
[11642.333736] [<ffffffff8147bd4d>] wait_for_completion+0x24/0x26
[11642.334672] [<ffffffffa044f5ce>] btrfs_wait_ordered_extents+0x1c8/0x217 [btrfs]
[11642.335858] [<ffffffffa0465b5a>] btrfs_mksubvol+0x224/0x45d [btrfs]
[11642.336854] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.337820] [<ffffffffa0465edb>] btrfs_ioctl_snap_create_transid+0x148/0x17a [btrfs]
[11642.339026] [<ffffffffa046603b>] btrfs_ioctl_snap_create_v2+0xc7/0x110 [btrfs]
[11642.340214] [<ffffffffa0468582>] btrfs_ioctl+0x590/0x27bd [btrfs]
[11642.341123] [<ffffffff8147dc00>] ? mutex_unlock+0xe/0x10
[11642.341934] [<ffffffffa00fa6e9>] ? ext4_file_write_iter+0x2a3/0x36f [ext4]
[11642.342936] [<ffffffff8108895d>] ? __lock_is_held+0x3c/0x57
[11642.343772] [<ffffffff81186a1d>] ? rcu_read_unlock+0x3e/0x5d
[11642.344673] [<ffffffff8117dc95>] do_vfs_ioctl+0x458/0x4dc
[11642.346024] [<ffffffff81186bbe>] ? __fget_light+0x62/0x71
[11642.346873] [<ffffffff8117dd70>] SyS_ioctl+0x57/0x79
[11642.347720] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.350222] 4 locks held by fdm-stress/26849:
[11642.350898] #0: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.352375] #1: (&type->i_mutex_dir_key#4/1){+.+.+.}, at: [<ffffffffa0465981>] btrfs_mksubvol+0x4b/0x45d [btrfs]
[11642.354072] #2: (&fs_info->subvol_sem){++++..}, at: [<ffffffffa0465a2a>] btrfs_mksubvol+0xf4/0x45d [btrfs]
[11642.355647] #3: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.357516] INFO: task fdm-stress:26850 blocked for more than 120 seconds.
[11642.358508] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.359376] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.368625] fdm-stress D ffff88021f167688 0 26850 26591 0x00000000
[11642.369716] ffff88021f167688 0000000000000001 0000000000014ec0 ffff88023edd4ec0
[11642.370950] ffff880128a98680 ffff88021f168000 ffff88023edd4ec0 7fffffffffffffff
[11642.372210] 0000000000000002 ffffffff8147b7f9 ffff88021f1676a0 ffffffff8147b541
[11642.373430] Call Trace:
[11642.373853] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.374623] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.375948] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.376862] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.377637] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.378610] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.379457] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.380366] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.381353] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.382255] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.383162] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.383945] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.384875] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.385749] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.386721] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.387596] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.389030] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.389973] [<ffffffff810a25ad>] ? rcu_read_lock_sched_held+0x61/0x69
[11642.390939] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.392271] [<ffffffffa0451c32>] ? __clear_extent_bit+0x26e/0x2c0 [btrfs]
[11642.393305] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.394239] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.395045] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.395991] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.397144] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.398392] [<ffffffffa0452094>] ? clear_extent_bit+0x17/0x19 [btrfs]
[11642.399363] [<ffffffffa0445945>] btrfs_get_blocks_direct+0x12b/0x61c [btrfs]
[11642.400445] [<ffffffff8119f7a1>] ? dio_bio_add_page+0x3d/0x54
[11642.401309] [<ffffffff8119fa93>] ? submit_page_section+0x7b/0x111
[11642.402213] [<ffffffff811a0258>] do_blockdev_direct_IO+0x685/0xc24
[11642.403139] [<ffffffffa044581a>] ? btrfs_page_exists_in_range+0x1a1/0x1a1 [btrfs]
[11642.404360] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.406187] [<ffffffff811a0828>] __blockdev_direct_IO+0x31/0x33
[11642.407070] [<ffffffff811a0828>] ? __blockdev_direct_IO+0x31/0x33
[11642.407990] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.409192] [<ffffffffa043b4ca>] btrfs_direct_IO+0x1c7/0x27e [btrfs]
[11642.410146] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.411291] [<ffffffff81119a2c>] generic_file_read_iter+0x89/0x4e1
[11642.412263] [<ffffffff8108ac05>] ? mark_lock+0x24/0x201
[11642.413057] [<ffffffff8116e1f8>] __vfs_read+0x79/0x9d
[11642.413897] [<ffffffff8116e6f1>] vfs_read+0x8f/0xd2
[11642.414708] [<ffffffff8116ef3d>] SyS_read+0x50/0x7e
[11642.415573] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.416572] 1 lock held by fdm-stress/26850:
[11642.417345] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.418703] INFO: task fdm-stress:26851 blocked for more than 120 seconds.
[11642.419698] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.420612] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.421807] fdm-stress D ffff880196483d28 0 26851 26591 0x00000000
[11642.422878] ffff880196483d28 00ff8801c8f60740 0000000000014ec0 ffff88023ed94ec0
[11642.424149] ffff8801c8f60740 ffff880196484000 0000000000000246 ffff8801c8f60740
[11642.425374] ffff8801bb711840 ffff8801bb711878 ffff880196483d40 ffffffff8147b541
[11642.426591] Call Trace:
[11642.427013] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.427856] [<ffffffff8147b6d5>] schedule_preempt_disabled+0x18/0x24
[11642.428852] [<ffffffff8147c23a>] mutex_lock_nested+0x1d7/0x3b4
[11642.429743] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.430911] [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.432102] [<ffffffffa044f674>] ? btrfs_wait_ordered_roots+0x57/0x191 [btrfs]
[11642.433259] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.434431] [<ffffffffa044f6ea>] btrfs_wait_ordered_roots+0xcd/0x191 [btrfs]
[11642.436079] [<ffffffffa0410cab>] btrfs_sync_fs+0xe0/0x1ad [btrfs]
[11642.437009] [<ffffffff81197900>] ? SyS_tee+0x23c/0x23c
[11642.437860] [<ffffffff81197920>] sync_fs_one_sb+0x20/0x22
[11642.438723] [<ffffffff81171435>] iterate_supers+0x75/0xc2
[11642.439597] [<ffffffff81197d00>] sys_sync+0x52/0x80
[11642.440454] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.441533] 3 locks held by fdm-stress/26851:
[11642.442370] #0: (&type->s_umount_key#37){+++++.}, at: [<ffffffff8117141f>] iterate_supers+0x5f/0xc2
[11642.444043] #1: (&fs_info->ordered_operations_mutex){+.+...}, at: [<ffffffffa044f661>] btrfs_wait_ordered_roots+0x44/0x191 [btrfs]
[11642.446010] #2: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
This happened because under specific timings the path for direct IO reads
can deadlock with concurrent buffered writes. The diagram below shows how
this happens for an example file that has the following layout:
[ extent A ] [ extent B ] [ ....
0K 4K 8K
CPU 1 CPU 2 CPU 3
DIO read against range
[0K, 8K[ starts
btrfs_direct_IO()
--> calls btrfs_get_blocks_direct()
which finds the extent map for the
extent A and leaves the range
[0K, 4K[ locked in the inode's
io tree
buffered write against
range [4K, 8K[ starts
__btrfs_buffered_write()
--> dirties page at 4K
a user space
task calls sync
for e.g or
writepages() is
invoked by mm
writepages()
run_delalloc_range()
cow_file_range()
--> ordered extent X
for the buffered
write is created
and
writeback starts
--> calls btrfs_get_blocks_direct()
again, without submitting first
a bio for reading extent A, and
finds the extent map for extent B
--> calls lock_extent_direct()
--> locks range [4K, 8K[
--> finds ordered extent X
covering range [4K, 8K[
--> unlocks range [4K, 8K[
buffered write against
range [0K, 8K[ starts
__btrfs_buffered_write()
prepare_pages()
--> locks pages with
offsets 0 and 4K
lock_and_cleanup_extent_if_need()
--> blocks attempting to
lock range [0K, 8K[ in
the inode's io tree,
because the range [0, 4K[
is already locked by the
direct IO task at CPU 1
--> calls
btrfs_start_ordered_extent(oe X)
btrfs_start_ordered_extent(oe X)
--> At this point writeback for ordered
extent X has not finished yet
filemap_fdatawrite_range()
btrfs_writepages()
extent_writepages()
extent_write_cache_pages()
--> finds page with offset 0
with the writeback tag
(and not dirty)
--> tries to lock it
--> deadlock, task at CPU 2
has the page locked and
is blocked on the io range
[0, 4K[ that was locked
earlier by this task
So fix this by falling back to a buffered read in the direct IO read path
when an ordered extent for a buffered write is found.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-18 14:28:55 +00:00
|
|
|
/*
|
|
|
|
|
* If we are doing a DIO read and the ordered extent we
|
|
|
|
|
* found is for a buffered write, we can not wait for it
|
|
|
|
|
* to complete and retry, because if we do so we can
|
|
|
|
|
* deadlock with concurrent buffered writes on page
|
|
|
|
|
* locks. This happens only if our DIO read covers more
|
|
|
|
|
* than one extent map, if at this point it has already
|
|
|
|
|
* created an ordered extent for a previous extent map
|
|
|
|
|
* and locked its range in the inode's io tree, and a
|
|
|
|
|
* concurrent write against that previous extent map's
|
|
|
|
|
* range and this range started (we unlock the ranges
|
|
|
|
|
* in the io tree only when the bios complete and
|
|
|
|
|
* buffered writes always lock pages before attempting
|
|
|
|
|
* to lock range in the io tree).
|
|
|
|
|
*/
|
|
|
|
|
if (writing ||
|
|
|
|
|
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
|
2020-09-18 12:15:53 +03:00
|
|
|
btrfs_start_ordered_extent(ordered, 1);
|
Btrfs: fix deadlock between direct IO reads and buffered writes
While running a test with a mix of buffered IO and direct IO against
the same files I hit a deadlock reported by the following trace:
[11642.140352] INFO: task kworker/u32:3:15282 blocked for more than 120 seconds.
[11642.142452] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.143982] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.146332] kworker/u32:3 D ffff880230ef7988 [11642.147737] systemd-journald[571]: Sent WATCHDOG=1 notification.
[11642.149771] 0 15282 2 0x00000000
[11642.151205] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs]
[11642.154074] ffff880230ef7988 0000000000000246 0000000000014ec0 ffff88023ec94ec0
[11642.156722] ffff880233fe8f80 ffff880230ef8000 ffff88023ec94ec0 7fffffffffffffff
[11642.159205] 0000000000000002 ffffffff8147b7f9 ffff880230ef79a0 ffffffff8147b541
[11642.161403] Call Trace:
[11642.162129] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.163396] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.164871] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.167020] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.167931] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.182320] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.183762] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.185308] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.186782] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.188217] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.189626] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.190803] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.192158] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.193379] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.194831] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.197068] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.199188] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.200723] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.202465] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.203836] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.205624] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.207057] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.208529] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.210375] [<ffffffffa0462613>] ? btrfs_scrubparity_helper+0x140/0x33a [btrfs]
[11642.212132] [<ffffffffa044f974>] btrfs_run_ordered_extent_work+0x25/0x34 [btrfs]
[11642.213837] [<ffffffffa046262f>] btrfs_scrubparity_helper+0x15c/0x33a [btrfs]
[11642.215457] [<ffffffffa046293b>] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
[11642.217095] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.218324] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.219466] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.220801] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.222032] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.223190] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.224394] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.226295] 2 locks held by kworker/u32:3/15282:
[11642.227273] #0: ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.229412] #1: ((&work->normal_work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.231414] INFO: task kworker/u32:8:15289 blocked for more than 120 seconds.
[11642.232872] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.234109] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.235776] kworker/u32:8 D ffff88020de5f848 0 15289 2 0x00000000
[11642.237412] Workqueue: writeback wb_workfn (flush-btrfs-481)
[11642.238670] ffff88020de5f848 0000000000000246 0000000000014ec0 ffff88023ed54ec0
[11642.240475] ffff88021b1ece40 ffff88020de60000 ffff88023ed54ec0 7fffffffffffffff
[11642.242154] 0000000000000002 ffffffff8147b7f9 ffff88020de5f860 ffffffff8147b541
[11642.243715] Call Trace:
[11642.244390] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.245432] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.246392] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.247479] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.248551] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.249968] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.251043] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.252202] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.253210] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.254307] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.256118] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.257131] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.258200] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.259168] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.260516] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.261841] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.263531] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.264747] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.266148] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.267264] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.268280] [<ffffffff81192a2b>] __writeback_single_inode+0xda/0x5ba
[11642.269407] [<ffffffff811939f0>] writeback_sb_inodes+0x27b/0x43d
[11642.270476] [<ffffffff81193c28>] __writeback_inodes_wb+0x76/0xae
[11642.271547] [<ffffffff81193ea6>] wb_writeback+0x19e/0x41c
[11642.272588] [<ffffffff81194821>] wb_workfn+0x201/0x341
[11642.273523] [<ffffffff81194821>] ? wb_workfn+0x201/0x341
[11642.274479] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.275497] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.276518] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.277520] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.278517] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.279371] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.280468] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.281607] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.282604] 3 locks held by kworker/u32:8/15289:
[11642.283423] #0: ("writeback"){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.285629] #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.287538] #2: (&type->s_umount_key#37){+++++.}, at: [<ffffffff81171217>] trylock_super+0x1b/0x4b
[11642.289423] INFO: task fdm-stress:26848 blocked for more than 120 seconds.
[11642.290547] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.291453] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.292864] fdm-stress D ffff88022c107c20 0 26848 26591 0x00000000
[11642.294118] ffff88022c107c20 000000038108affa 0000000000014ec0 ffff88023ed54ec0
[11642.295602] ffff88013ab1ca40 ffff88022c108000 ffff8800b2fc19d0 00000000000e0fff
[11642.297098] ffff8800b2fc19b0 ffff88022c107c88 ffff88022c107c38 ffffffff8147b541
[11642.298433] Call Trace:
[11642.298896] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.299738] [<ffffffffa045225d>] lock_extent_bits+0xfe/0x1a3 [btrfs]
[11642.300833] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.301943] [<ffffffffa0447516>] lock_and_cleanup_extent_if_need+0x68/0x18e [btrfs]
[11642.303270] [<ffffffffa04485ba>] __btrfs_buffered_write+0x238/0x4c1 [btrfs]
[11642.304552] [<ffffffffa044b50a>] ? btrfs_file_write_iter+0x17c/0x408 [btrfs]
[11642.305782] [<ffffffffa044b682>] btrfs_file_write_iter+0x2f4/0x408 [btrfs]
[11642.306878] [<ffffffff8116e298>] __vfs_write+0x7c/0xa5
[11642.307729] [<ffffffff8116e7d1>] vfs_write+0x9d/0xe8
[11642.308602] [<ffffffff8116efbb>] SyS_write+0x50/0x7e
[11642.309410] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.310403] 3 locks held by fdm-stress/26848:
[11642.311108] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.312578] #1: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.314170] #2: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa044b401>] btrfs_file_write_iter+0x73/0x408 [btrfs]
[11642.316796] INFO: task fdm-stress:26849 blocked for more than 120 seconds.
[11642.317842] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.318691] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.319959] fdm-stress D ffff8801964ffa68 0 26849 26591 0x00000000
[11642.321312] ffff8801964ffa68 00ff8801e9975f80 0000000000014ec0 ffff88023ed94ec0
[11642.322555] ffff8800b00b4840 ffff880196500000 ffff8801e9975f20 0000000000000002
[11642.323715] ffff8801e9975f18 ffff8800b00b4840 ffff8801964ffa80 ffffffff8147b541
[11642.325096] Call Trace:
[11642.325532] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.326303] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.327180] [<ffffffff8108ae40>] ? mark_held_locks+0x5e/0x74
[11642.328114] [<ffffffff8147f30e>] ? _raw_spin_unlock_irq+0x2c/0x4a
[11642.329051] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.330053] [<ffffffff8147bceb>] __wait_for_common+0x109/0x147
[11642.330952] [<ffffffff8147bceb>] ? __wait_for_common+0x109/0x147
[11642.331869] [<ffffffff8147e7bb>] ? usleep_range+0x4a/0x4a
[11642.332925] [<ffffffff81074075>] ? wake_up_q+0x47/0x47
[11642.333736] [<ffffffff8147bd4d>] wait_for_completion+0x24/0x26
[11642.334672] [<ffffffffa044f5ce>] btrfs_wait_ordered_extents+0x1c8/0x217 [btrfs]
[11642.335858] [<ffffffffa0465b5a>] btrfs_mksubvol+0x224/0x45d [btrfs]
[11642.336854] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.337820] [<ffffffffa0465edb>] btrfs_ioctl_snap_create_transid+0x148/0x17a [btrfs]
[11642.339026] [<ffffffffa046603b>] btrfs_ioctl_snap_create_v2+0xc7/0x110 [btrfs]
[11642.340214] [<ffffffffa0468582>] btrfs_ioctl+0x590/0x27bd [btrfs]
[11642.341123] [<ffffffff8147dc00>] ? mutex_unlock+0xe/0x10
[11642.341934] [<ffffffffa00fa6e9>] ? ext4_file_write_iter+0x2a3/0x36f [ext4]
[11642.342936] [<ffffffff8108895d>] ? __lock_is_held+0x3c/0x57
[11642.343772] [<ffffffff81186a1d>] ? rcu_read_unlock+0x3e/0x5d
[11642.344673] [<ffffffff8117dc95>] do_vfs_ioctl+0x458/0x4dc
[11642.346024] [<ffffffff81186bbe>] ? __fget_light+0x62/0x71
[11642.346873] [<ffffffff8117dd70>] SyS_ioctl+0x57/0x79
[11642.347720] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.350222] 4 locks held by fdm-stress/26849:
[11642.350898] #0: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.352375] #1: (&type->i_mutex_dir_key#4/1){+.+.+.}, at: [<ffffffffa0465981>] btrfs_mksubvol+0x4b/0x45d [btrfs]
[11642.354072] #2: (&fs_info->subvol_sem){++++..}, at: [<ffffffffa0465a2a>] btrfs_mksubvol+0xf4/0x45d [btrfs]
[11642.355647] #3: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.357516] INFO: task fdm-stress:26850 blocked for more than 120 seconds.
[11642.358508] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.359376] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.368625] fdm-stress D ffff88021f167688 0 26850 26591 0x00000000
[11642.369716] ffff88021f167688 0000000000000001 0000000000014ec0 ffff88023edd4ec0
[11642.370950] ffff880128a98680 ffff88021f168000 ffff88023edd4ec0 7fffffffffffffff
[11642.372210] 0000000000000002 ffffffff8147b7f9 ffff88021f1676a0 ffffffff8147b541
[11642.373430] Call Trace:
[11642.373853] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.374623] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.375948] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.376862] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.377637] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.378610] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.379457] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.380366] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.381353] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.382255] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.383162] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.383945] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.384875] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.385749] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.386721] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.387596] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.389030] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.389973] [<ffffffff810a25ad>] ? rcu_read_lock_sched_held+0x61/0x69
[11642.390939] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.392271] [<ffffffffa0451c32>] ? __clear_extent_bit+0x26e/0x2c0 [btrfs]
[11642.393305] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.394239] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.395045] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.395991] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.397144] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.398392] [<ffffffffa0452094>] ? clear_extent_bit+0x17/0x19 [btrfs]
[11642.399363] [<ffffffffa0445945>] btrfs_get_blocks_direct+0x12b/0x61c [btrfs]
[11642.400445] [<ffffffff8119f7a1>] ? dio_bio_add_page+0x3d/0x54
[11642.401309] [<ffffffff8119fa93>] ? submit_page_section+0x7b/0x111
[11642.402213] [<ffffffff811a0258>] do_blockdev_direct_IO+0x685/0xc24
[11642.403139] [<ffffffffa044581a>] ? btrfs_page_exists_in_range+0x1a1/0x1a1 [btrfs]
[11642.404360] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.406187] [<ffffffff811a0828>] __blockdev_direct_IO+0x31/0x33
[11642.407070] [<ffffffff811a0828>] ? __blockdev_direct_IO+0x31/0x33
[11642.407990] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.409192] [<ffffffffa043b4ca>] btrfs_direct_IO+0x1c7/0x27e [btrfs]
[11642.410146] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.411291] [<ffffffff81119a2c>] generic_file_read_iter+0x89/0x4e1
[11642.412263] [<ffffffff8108ac05>] ? mark_lock+0x24/0x201
[11642.413057] [<ffffffff8116e1f8>] __vfs_read+0x79/0x9d
[11642.413897] [<ffffffff8116e6f1>] vfs_read+0x8f/0xd2
[11642.414708] [<ffffffff8116ef3d>] SyS_read+0x50/0x7e
[11642.415573] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.416572] 1 lock held by fdm-stress/26850:
[11642.417345] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.418703] INFO: task fdm-stress:26851 blocked for more than 120 seconds.
[11642.419698] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.420612] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.421807] fdm-stress D ffff880196483d28 0 26851 26591 0x00000000
[11642.422878] ffff880196483d28 00ff8801c8f60740 0000000000014ec0 ffff88023ed94ec0
[11642.424149] ffff8801c8f60740 ffff880196484000 0000000000000246 ffff8801c8f60740
[11642.425374] ffff8801bb711840 ffff8801bb711878 ffff880196483d40 ffffffff8147b541
[11642.426591] Call Trace:
[11642.427013] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.427856] [<ffffffff8147b6d5>] schedule_preempt_disabled+0x18/0x24
[11642.428852] [<ffffffff8147c23a>] mutex_lock_nested+0x1d7/0x3b4
[11642.429743] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.430911] [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.432102] [<ffffffffa044f674>] ? btrfs_wait_ordered_roots+0x57/0x191 [btrfs]
[11642.433259] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.434431] [<ffffffffa044f6ea>] btrfs_wait_ordered_roots+0xcd/0x191 [btrfs]
[11642.436079] [<ffffffffa0410cab>] btrfs_sync_fs+0xe0/0x1ad [btrfs]
[11642.437009] [<ffffffff81197900>] ? SyS_tee+0x23c/0x23c
[11642.437860] [<ffffffff81197920>] sync_fs_one_sb+0x20/0x22
[11642.438723] [<ffffffff81171435>] iterate_supers+0x75/0xc2
[11642.439597] [<ffffffff81197d00>] sys_sync+0x52/0x80
[11642.440454] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.441533] 3 locks held by fdm-stress/26851:
[11642.442370] #0: (&type->s_umount_key#37){+++++.}, at: [<ffffffff8117141f>] iterate_supers+0x5f/0xc2
[11642.444043] #1: (&fs_info->ordered_operations_mutex){+.+...}, at: [<ffffffffa044f661>] btrfs_wait_ordered_roots+0x44/0x191 [btrfs]
[11642.446010] #2: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
This happened because under specific timings the path for direct IO reads
can deadlock with concurrent buffered writes. The diagram below shows how
this happens for an example file that has the following layout:
[ extent A ] [ extent B ] [ ....
0K 4K 8K
CPU 1 CPU 2 CPU 3
DIO read against range
[0K, 8K[ starts
btrfs_direct_IO()
--> calls btrfs_get_blocks_direct()
which finds the extent map for the
extent A and leaves the range
[0K, 4K[ locked in the inode's
io tree
buffered write against
range [4K, 8K[ starts
__btrfs_buffered_write()
--> dirties page at 4K
a user space
task calls sync
for example, or
writepages() is
invoked by mm
writepages()
run_delalloc_range()
cow_file_range()
--> ordered extent X
for the buffered
write is created
and
writeback starts
--> calls btrfs_get_blocks_direct()
again, without submitting first
a bio for reading extent A, and
finds the extent map for extent B
--> calls lock_extent_direct()
--> locks range [4K, 8K[
--> finds ordered extent X
covering range [4K, 8K[
--> unlocks range [4K, 8K[
buffered write against
range [0K, 8K[ starts
__btrfs_buffered_write()
prepare_pages()
--> locks pages with
offsets 0 and 4K
lock_and_cleanup_extent_if_need()
--> blocks attempting to
lock range [0K, 8K[ in
the inode's io tree,
because the range [0, 4K[
is already locked by the
direct IO task at CPU 1
--> calls
btrfs_start_ordered_extent(oe X)
btrfs_start_ordered_extent(oe X)
--> At this point writeback for ordered
extent X has not finished yet
filemap_fdatawrite_range()
btrfs_writepages()
extent_writepages()
extent_write_cache_pages()
--> finds page with offset 0
with the writeback tag
(and not dirty)
--> tries to lock it
--> deadlock, task at CPU 2
has the page locked and
is blocked on the io range
[0, 4K[ that was locked
earlier by this task
So fix this by falling back to a buffered read in the direct IO read path
when an ordered extent for a buffered write is found.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-18 14:28:55 +00:00
|
|
|
else
|
2022-03-23 16:19:24 +00:00
|
|
|
ret = nowait ? -EAGAIN : -ENOTBLK;
|
2012-07-31 16:28:48 -04:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back to buffered IO. This was a rare case anyway, and well behaved
applications do not mix concurrent direct IO writes with buffered reads,
a concurrent defrag being the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-08 16:23:16 +00:00
|
|
|
* We could trigger writeback for this range (and wait
|
|
|
|
|
* for it to complete) and then invalidate the pages for
|
|
|
|
|
* this range (through invalidate_inode_pages2_range()),
|
|
|
|
|
* but that can lead us to a deadlock with a concurrent
|
2020-06-01 21:47:05 -07:00
|
|
|
* call to readahead (a buffered read or a defrag call
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back to buffered IO. This was a rare case anyway, and well behaved
applications do not mix concurrent direct IO writes with buffered reads,
a concurrent defrag being the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-08 16:23:16 +00:00
|
|
|
* triggered a readahead) on a page lock due to an
|
|
|
|
|
* ordered dio extent we created before but did not have
|
|
|
|
|
* yet a corresponding bio submitted (whence it can not
|
2020-06-01 21:47:05 -07:00
|
|
|
* complete), which makes readahead wait for that
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back to buffered IO. This was a rare case anyway, and well behaved
applications do not mix concurrent direct IO writes with buffered reads,
a concurrent defrag being the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-08 16:23:16 +00:00
|
|
|
* ordered extent to complete while holding a lock on
|
|
|
|
|
* that page.
|
2012-07-31 16:28:48 -04:00
|
|
|
*/
|
2022-03-23 16:19:24 +00:00
|
|
|
ret = nowait ? -EAGAIN : -ENOTBLK;
|
2012-07-31 16:28:48 -04:00
|
|
|
}
|
|
|
|
|
|
Btrfs: fix deadlock between direct IO reads and buffered writes
While running a test with a mix of buffered IO and direct IO against
the same files I hit a deadlock reported by the following trace:
[11642.140352] INFO: task kworker/u32:3:15282 blocked for more than 120 seconds.
[11642.142452] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.143982] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.146332] kworker/u32:3 D ffff880230ef7988 [11642.147737] systemd-journald[571]: Sent WATCHDOG=1 notification.
[11642.149771] 0 15282 2 0x00000000
[11642.151205] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs]
[11642.154074] ffff880230ef7988 0000000000000246 0000000000014ec0 ffff88023ec94ec0
[11642.156722] ffff880233fe8f80 ffff880230ef8000 ffff88023ec94ec0 7fffffffffffffff
[11642.159205] 0000000000000002 ffffffff8147b7f9 ffff880230ef79a0 ffffffff8147b541
[11642.161403] Call Trace:
[11642.162129] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.163396] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.164871] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.167020] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.167931] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.182320] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.183762] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.185308] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.186782] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.188217] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.189626] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.190803] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.192158] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.193379] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.194831] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.197068] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.199188] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.200723] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.202465] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.203836] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.205624] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.207057] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.208529] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.210375] [<ffffffffa0462613>] ? btrfs_scrubparity_helper+0x140/0x33a [btrfs]
[11642.212132] [<ffffffffa044f974>] btrfs_run_ordered_extent_work+0x25/0x34 [btrfs]
[11642.213837] [<ffffffffa046262f>] btrfs_scrubparity_helper+0x15c/0x33a [btrfs]
[11642.215457] [<ffffffffa046293b>] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
[11642.217095] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.218324] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.219466] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.220801] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.222032] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.223190] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.224394] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.226295] 2 locks held by kworker/u32:3/15282:
[11642.227273] #0: ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.229412] #1: ((&work->normal_work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.231414] INFO: task kworker/u32:8:15289 blocked for more than 120 seconds.
[11642.232872] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.234109] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.235776] kworker/u32:8 D ffff88020de5f848 0 15289 2 0x00000000
[11642.237412] Workqueue: writeback wb_workfn (flush-btrfs-481)
[11642.238670] ffff88020de5f848 0000000000000246 0000000000014ec0 ffff88023ed54ec0
[11642.240475] ffff88021b1ece40 ffff88020de60000 ffff88023ed54ec0 7fffffffffffffff
[11642.242154] 0000000000000002 ffffffff8147b7f9 ffff88020de5f860 ffffffff8147b541
[11642.243715] Call Trace:
[11642.244390] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.245432] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.246392] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.247479] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.248551] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.249968] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.251043] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.252202] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.253210] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.254307] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.256118] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.257131] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.258200] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.259168] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.260516] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.261841] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.263531] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.264747] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.266148] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.267264] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.268280] [<ffffffff81192a2b>] __writeback_single_inode+0xda/0x5ba
[11642.269407] [<ffffffff811939f0>] writeback_sb_inodes+0x27b/0x43d
[11642.270476] [<ffffffff81193c28>] __writeback_inodes_wb+0x76/0xae
[11642.271547] [<ffffffff81193ea6>] wb_writeback+0x19e/0x41c
[11642.272588] [<ffffffff81194821>] wb_workfn+0x201/0x341
[11642.273523] [<ffffffff81194821>] ? wb_workfn+0x201/0x341
[11642.274479] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.275497] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.276518] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.277520] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.278517] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.279371] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.280468] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.281607] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.282604] 3 locks held by kworker/u32:8/15289:
[11642.283423] #0: ("writeback"){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.285629] #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.287538] #2: (&type->s_umount_key#37){+++++.}, at: [<ffffffff81171217>] trylock_super+0x1b/0x4b
[11642.289423] INFO: task fdm-stress:26848 blocked for more than 120 seconds.
[11642.290547] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.291453] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.292864] fdm-stress D ffff88022c107c20 0 26848 26591 0x00000000
[11642.294118] ffff88022c107c20 000000038108affa 0000000000014ec0 ffff88023ed54ec0
[11642.295602] ffff88013ab1ca40 ffff88022c108000 ffff8800b2fc19d0 00000000000e0fff
[11642.297098] ffff8800b2fc19b0 ffff88022c107c88 ffff88022c107c38 ffffffff8147b541
[11642.298433] Call Trace:
[11642.298896] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.299738] [<ffffffffa045225d>] lock_extent_bits+0xfe/0x1a3 [btrfs]
[11642.300833] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.301943] [<ffffffffa0447516>] lock_and_cleanup_extent_if_need+0x68/0x18e [btrfs]
[11642.303270] [<ffffffffa04485ba>] __btrfs_buffered_write+0x238/0x4c1 [btrfs]
[11642.304552] [<ffffffffa044b50a>] ? btrfs_file_write_iter+0x17c/0x408 [btrfs]
[11642.305782] [<ffffffffa044b682>] btrfs_file_write_iter+0x2f4/0x408 [btrfs]
[11642.306878] [<ffffffff8116e298>] __vfs_write+0x7c/0xa5
[11642.307729] [<ffffffff8116e7d1>] vfs_write+0x9d/0xe8
[11642.308602] [<ffffffff8116efbb>] SyS_write+0x50/0x7e
[11642.309410] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.310403] 3 locks held by fdm-stress/26848:
[11642.311108] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.312578] #1: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.314170] #2: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa044b401>] btrfs_file_write_iter+0x73/0x408 [btrfs]
[11642.316796] INFO: task fdm-stress:26849 blocked for more than 120 seconds.
[11642.317842] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.318691] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.319959] fdm-stress D ffff8801964ffa68 0 26849 26591 0x00000000
[11642.321312] ffff8801964ffa68 00ff8801e9975f80 0000000000014ec0 ffff88023ed94ec0
[11642.322555] ffff8800b00b4840 ffff880196500000 ffff8801e9975f20 0000000000000002
[11642.323715] ffff8801e9975f18 ffff8800b00b4840 ffff8801964ffa80 ffffffff8147b541
[11642.325096] Call Trace:
[11642.325532] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.326303] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.327180] [<ffffffff8108ae40>] ? mark_held_locks+0x5e/0x74
[11642.328114] [<ffffffff8147f30e>] ? _raw_spin_unlock_irq+0x2c/0x4a
[11642.329051] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.330053] [<ffffffff8147bceb>] __wait_for_common+0x109/0x147
[11642.330952] [<ffffffff8147bceb>] ? __wait_for_common+0x109/0x147
[11642.331869] [<ffffffff8147e7bb>] ? usleep_range+0x4a/0x4a
[11642.332925] [<ffffffff81074075>] ? wake_up_q+0x47/0x47
[11642.333736] [<ffffffff8147bd4d>] wait_for_completion+0x24/0x26
[11642.334672] [<ffffffffa044f5ce>] btrfs_wait_ordered_extents+0x1c8/0x217 [btrfs]
[11642.335858] [<ffffffffa0465b5a>] btrfs_mksubvol+0x224/0x45d [btrfs]
[11642.336854] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.337820] [<ffffffffa0465edb>] btrfs_ioctl_snap_create_transid+0x148/0x17a [btrfs]
[11642.339026] [<ffffffffa046603b>] btrfs_ioctl_snap_create_v2+0xc7/0x110 [btrfs]
[11642.340214] [<ffffffffa0468582>] btrfs_ioctl+0x590/0x27bd [btrfs]
[11642.341123] [<ffffffff8147dc00>] ? mutex_unlock+0xe/0x10
[11642.341934] [<ffffffffa00fa6e9>] ? ext4_file_write_iter+0x2a3/0x36f [ext4]
[11642.342936] [<ffffffff8108895d>] ? __lock_is_held+0x3c/0x57
[11642.343772] [<ffffffff81186a1d>] ? rcu_read_unlock+0x3e/0x5d
[11642.344673] [<ffffffff8117dc95>] do_vfs_ioctl+0x458/0x4dc
[11642.346024] [<ffffffff81186bbe>] ? __fget_light+0x62/0x71
[11642.346873] [<ffffffff8117dd70>] SyS_ioctl+0x57/0x79
[11642.347720] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.350222] 4 locks held by fdm-stress/26849:
[11642.350898] #0: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.352375] #1: (&type->i_mutex_dir_key#4/1){+.+.+.}, at: [<ffffffffa0465981>] btrfs_mksubvol+0x4b/0x45d [btrfs]
[11642.354072] #2: (&fs_info->subvol_sem){++++..}, at: [<ffffffffa0465a2a>] btrfs_mksubvol+0xf4/0x45d [btrfs]
[11642.355647] #3: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.357516] INFO: task fdm-stress:26850 blocked for more than 120 seconds.
[11642.358508] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.359376] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.368625] fdm-stress D ffff88021f167688 0 26850 26591 0x00000000
[11642.369716] ffff88021f167688 0000000000000001 0000000000014ec0 ffff88023edd4ec0
[11642.370950] ffff880128a98680 ffff88021f168000 ffff88023edd4ec0 7fffffffffffffff
[11642.372210] 0000000000000002 ffffffff8147b7f9 ffff88021f1676a0 ffffffff8147b541
[11642.373430] Call Trace:
[11642.373853] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.374623] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.375948] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.376862] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.377637] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.378610] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.379457] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.380366] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.381353] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.382255] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.383162] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.383945] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.384875] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.385749] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.386721] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.387596] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.389030] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.389973] [<ffffffff810a25ad>] ? rcu_read_lock_sched_held+0x61/0x69
[11642.390939] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.392271] [<ffffffffa0451c32>] ? __clear_extent_bit+0x26e/0x2c0 [btrfs]
[11642.393305] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.394239] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.395045] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.395991] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.397144] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.398392] [<ffffffffa0452094>] ? clear_extent_bit+0x17/0x19 [btrfs]
[11642.399363] [<ffffffffa0445945>] btrfs_get_blocks_direct+0x12b/0x61c [btrfs]
[11642.400445] [<ffffffff8119f7a1>] ? dio_bio_add_page+0x3d/0x54
[11642.401309] [<ffffffff8119fa93>] ? submit_page_section+0x7b/0x111
[11642.402213] [<ffffffff811a0258>] do_blockdev_direct_IO+0x685/0xc24
[11642.403139] [<ffffffffa044581a>] ? btrfs_page_exists_in_range+0x1a1/0x1a1 [btrfs]
[11642.404360] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.406187] [<ffffffff811a0828>] __blockdev_direct_IO+0x31/0x33
[11642.407070] [<ffffffff811a0828>] ? __blockdev_direct_IO+0x31/0x33
[11642.407990] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.409192] [<ffffffffa043b4ca>] btrfs_direct_IO+0x1c7/0x27e [btrfs]
[11642.410146] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.411291] [<ffffffff81119a2c>] generic_file_read_iter+0x89/0x4e1
[11642.412263] [<ffffffff8108ac05>] ? mark_lock+0x24/0x201
[11642.413057] [<ffffffff8116e1f8>] __vfs_read+0x79/0x9d
[11642.413897] [<ffffffff8116e6f1>] vfs_read+0x8f/0xd2
[11642.414708] [<ffffffff8116ef3d>] SyS_read+0x50/0x7e
[11642.415573] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.416572] 1 lock held by fdm-stress/26850:
[11642.417345] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.418703] INFO: task fdm-stress:26851 blocked for more than 120 seconds.
[11642.419698] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.420612] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.421807] fdm-stress D ffff880196483d28 0 26851 26591 0x00000000
[11642.422878] ffff880196483d28 00ff8801c8f60740 0000000000014ec0 ffff88023ed94ec0
[11642.424149] ffff8801c8f60740 ffff880196484000 0000000000000246 ffff8801c8f60740
[11642.425374] ffff8801bb711840 ffff8801bb711878 ffff880196483d40 ffffffff8147b541
[11642.426591] Call Trace:
[11642.427013] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.427856] [<ffffffff8147b6d5>] schedule_preempt_disabled+0x18/0x24
[11642.428852] [<ffffffff8147c23a>] mutex_lock_nested+0x1d7/0x3b4
[11642.429743] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.430911] [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.432102] [<ffffffffa044f674>] ? btrfs_wait_ordered_roots+0x57/0x191 [btrfs]
[11642.433259] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.434431] [<ffffffffa044f6ea>] btrfs_wait_ordered_roots+0xcd/0x191 [btrfs]
[11642.436079] [<ffffffffa0410cab>] btrfs_sync_fs+0xe0/0x1ad [btrfs]
[11642.437009] [<ffffffff81197900>] ? SyS_tee+0x23c/0x23c
[11642.437860] [<ffffffff81197920>] sync_fs_one_sb+0x20/0x22
[11642.438723] [<ffffffff81171435>] iterate_supers+0x75/0xc2
[11642.439597] [<ffffffff81197d00>] sys_sync+0x52/0x80
[11642.440454] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.441533] 3 locks held by fdm-stress/26851:
[11642.442370] #0: (&type->s_umount_key#37){+++++.}, at: [<ffffffff8117141f>] iterate_supers+0x5f/0xc2
[11642.444043] #1: (&fs_info->ordered_operations_mutex){+.+...}, at: [<ffffffffa044f661>] btrfs_wait_ordered_roots+0x44/0x191 [btrfs]
[11642.446010] #2: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
This happened because under specific timings the path for direct IO reads
can deadlock with concurrent buffered writes. The diagram below shows how
this happens for an example file that has the following layout:
[ extent A ] [ extent B ] [ ....
0K 4K 8K
CPU 1 CPU 2 CPU 3
DIO read against range
[0K, 8K[ starts
btrfs_direct_IO()
--> calls btrfs_get_blocks_direct()
which finds the extent map for the
extent A and leaves the range
[0K, 4K[ locked in the inode's
io tree
buffered write against
range [4K, 8K[ starts
__btrfs_buffered_write()
--> dirties page at 4K
a user space
task calls sync
for e.g or
writepages() is
invoked by mm
writepages()
run_delalloc_range()
cow_file_range()
--> ordered extent X
for the buffered
write is created
and
writeback starts
--> calls btrfs_get_blocks_direct()
again, without submitting first
a bio for reading extent A, and
finds the extent map for extent B
--> calls lock_extent_direct()
--> locks range [4K, 8K[
--> finds ordered extent X
covering range [4K, 8K[
--> unlocks range [4K, 8K[
buffered write against
range [0K, 8K[ starts
__btrfs_buffered_write()
prepare_pages()
--> locks pages with
offsets 0 and 4K
lock_and_cleanup_extent_if_need()
--> blocks attempting to
lock range [0K, 8K[ in
the inode's io tree,
because the range [0, 4K[
is already locked by the
direct IO task at CPU 1
--> calls
btrfs_start_ordered_extent(oe X)
btrfs_start_ordered_extent(oe X)
--> At this point writeback for ordered
extent X has not finished yet
filemap_fdatawrite_range()
btrfs_writepages()
extent_writepages()
extent_write_cache_pages()
--> finds page with offset 0
with the writeback tag
(and not dirty)
--> tries to lock it
--> deadlock, task at CPU 2
has the page locked and
is blocked on the io range
[0, 4K[ that was locked
earlier by this task
So fix this by falling back to a buffered read in the direct IO read path
when an ordered extent for a buffered write is found.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-18 14:28:55 +00:00
|
|
|
if (ret)
|
|
|
|
|
break;
|
|
|
|
|
|
2012-07-31 16:28:48 -04:00
|
|
|
cond_resched();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2017-01-31 07:50:22 -08:00
|
|
|
/* The callers of this must take lock_extent() */
|
2020-06-03 08:55:05 +03:00
|
|
|
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
|
|
|
|
|
u64 len, u64 orig_start, u64 block_start,
|
2017-01-31 07:50:22 -08:00
|
|
|
u64 block_len, u64 orig_block_len,
|
|
|
|
|
u64 ram_bytes, int compress_type,
|
|
|
|
|
int type)
|
2012-09-11 15:40:07 -04:00
|
|
|
{
|
|
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
|
struct extent_map *em;
|
|
|
|
|
int ret;
|
|
|
|
|
|
2017-01-31 07:50:22 -08:00
|
|
|
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
|
|
|
|
|
type == BTRFS_ORDERED_COMPRESSED ||
|
|
|
|
|
type == BTRFS_ORDERED_NOCOW ||
|
2017-02-13 15:35:09 -08:00
|
|
|
type == BTRFS_ORDERED_REGULAR);
|
2017-01-31 07:50:22 -08:00
|
|
|
|
2020-06-03 08:55:05 +03:00
|
|
|
em_tree = &inode->extent_tree;
|
2012-09-11 15:40:07 -04:00
|
|
|
em = alloc_extent_map();
|
|
|
|
|
if (!em)
|
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
|
|
em->start = start;
|
|
|
|
|
em->orig_start = orig_start;
|
|
|
|
|
em->len = len;
|
|
|
|
|
em->block_len = block_len;
|
|
|
|
|
em->block_start = block_start;
|
2012-12-03 10:31:19 -05:00
|
|
|
em->orig_block_len = orig_block_len;
|
2013-04-04 14:31:27 -04:00
|
|
|
em->ram_bytes = ram_bytes;
|
2012-10-11 16:54:30 -04:00
|
|
|
em->generation = -1;
|
2012-09-11 15:40:07 -04:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
2017-02-13 15:35:09 -08:00
|
|
|
if (type == BTRFS_ORDERED_PREALLOC) {
|
2012-12-03 10:58:15 -05:00
|
|
|
set_bit(EXTENT_FLAG_FILLING, &em->flags);
|
2017-02-13 15:35:09 -08:00
|
|
|
} else if (type == BTRFS_ORDERED_COMPRESSED) {
|
2017-01-31 07:50:22 -08:00
|
|
|
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
|
|
|
|
|
em->compress_type = compress_type;
|
|
|
|
|
}
|
2012-09-11 15:40:07 -04:00
|
|
|
|
|
|
|
|
do {
|
2020-06-03 08:55:05 +03:00
|
|
|
btrfs_drop_extent_cache(inode, em->start,
|
|
|
|
|
em->start + em->len - 1, 0);
|
2012-09-11 15:40:07 -04:00
|
|
|
write_lock(&em_tree->lock);
|
2013-04-05 16:51:15 -04:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
2012-09-11 15:40:07 -04:00
|
|
|
write_unlock(&em_tree->lock);
|
2017-01-31 07:50:22 -08:00
|
|
|
/*
|
|
|
|
|
* The caller has taken lock_extent(), who could race with us
|
|
|
|
|
* to add em?
|
|
|
|
|
*/
|
2012-09-11 15:40:07 -04:00
|
|
|
} while (ret == -EEXIST);
|
|
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
}
|
|
|
|
|
|
2017-01-31 07:50:22 -08:00
|
|
|
/* em got 2 refs now, callers needs to do free_extent_map once. */
|
2012-09-11 15:40:07 -04:00
|
|
|
return em;
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-02 15:19:32 +03:00
|
|
|
|
2018-05-02 15:19:33 +03:00
|
|
|
static int btrfs_get_blocks_direct_write(struct extent_map **map,
|
|
|
|
|
struct inode *inode,
|
|
|
|
|
struct btrfs_dio_data *dio_data,
|
btrfs: avoid double nocow check when doing nowait dio writes
When doing a NOWAIT direct IO write we are checking twice if we can COW
into the target file range using can_nocow_extent() - once at the very
beginning of the write path, at btrfs_write_check() via
check_nocow_nolock(), and later again at btrfs_get_blocks_direct_write().
The can_nocow_extent() function does a lot of expensive things - searching
for the file extent item in the inode's subvolume tree, searching for the
extent item in the extent tree, checking delayed references, etc, so it
isn't a very cheap call.
We can remove the first check at btrfs_write_check(), and add there a
quick check to verify if the inode has the NODATACOW or PREALLOC flags,
and quickly bail out if it has neither of those flags, as that
means we have to COW and therefore can't comply with the NOWAIT semantics.
After this we do only one call to can_nocow_extent(), while we are at
btrfs_get_blocks_direct_write(), where we have already locked the file
range and we did a try lock on the range before, at
btrfs_dio_iomap_begin() (since the previous patch in the series).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:25 +00:00
|
|
|
u64 start, u64 len,
|
|
|
|
|
unsigned int iomap_flags)
|
2018-05-02 15:19:33 +03:00
|
|
|
{
|
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path to try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run on a 12 cores box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
|
|
|
const bool nowait = (iomap_flags & IOMAP_NOWAIT);
|
2018-05-02 15:19:33 +03:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
struct extent_map *em = *map;
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data blocks groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
int type;
|
|
|
|
|
u64 block_start, orig_start, orig_block_len, ram_bytes;
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
other hand there's access to the tree in the NOCOW path, when incrementing
a block group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
struct btrfs_block_group *bg;
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
bool can_nocow = false;
|
|
|
|
|
bool space_reserved = false;
|
2022-03-28 21:32:05 +09:00
|
|
|
u64 prev_len;
|
2018-05-02 15:19:33 +03:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We don't allocate a new extent in the following cases
|
|
|
|
|
*
|
|
|
|
|
* 1) The inode is marked as NODATACOW. In this case we'll just use the
|
|
|
|
|
* existing extent.
|
|
|
|
|
* 2) The extent is marked as PREALLOC. We're good to go here and can
|
|
|
|
|
* just use the extent.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
|
|
|
|
|
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
|
|
|
|
|
em->block_start != EXTENT_MAP_HOLE)) {
|
|
|
|
|
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
|
|
|
|
|
type = BTRFS_ORDERED_PREALLOC;
|
|
|
|
|
else
|
|
|
|
|
type = BTRFS_ORDERED_NOCOW;
|
|
|
|
|
len = min(len, em->len - (start - em->start));
|
|
|
|
|
block_start = em->block_start + (start - em->start);
|
|
|
|
|
|
|
|
|
|
if (can_nocow_extent(inode, start, &len, &orig_start,
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
other hand there's access to the tree in the NOCOW path, when incrementing a block
group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
&orig_block_len, &ram_bytes, false) == 1) {
|
|
|
|
|
bg = btrfs_inc_nocow_writers(fs_info, block_start);
|
|
|
|
|
if (bg)
|
|
|
|
|
can_nocow = true;
|
|
|
|
|
}
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
}
|
2018-05-02 15:19:33 +03:00
|
|
|
|
2022-03-28 21:32:05 +09:00
|
|
|
prev_len = len;
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (can_nocow) {
|
|
|
|
|
struct extent_map *em2;
|
|
|
|
|
|
|
|
|
|
/* We can NOCOW, so only need to reserve metadata space. */
|
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run on a 12-core box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
|
|
|
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
|
|
|
|
|
nowait);
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (ret < 0) {
|
|
|
|
|
/* Our caller expects us to free the input extent map. */
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
*map = NULL;
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
other hand there's access to the tree in the NOCOW path, when incrementing a block
group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
btrfs_dec_nocow_writers(bg);
|
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run on a 12-core box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
|
|
|
if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
|
|
|
|
|
ret = -EAGAIN;
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
space_reserved = true;
|
|
|
|
|
|
|
|
|
|
em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
|
|
|
|
|
orig_start, block_start,
|
|
|
|
|
len, orig_block_len,
|
|
|
|
|
ram_bytes, type);
|
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the results were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
other hand there's access to the tree in the NOCOW path, when incrementing a
block group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-13 16:20:43 +01:00
|
|
|
btrfs_dec_nocow_writers(bg);
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (type == BTRFS_ORDERED_PREALLOC) {
|
|
|
|
|
free_extent_map(em);
|
2022-06-21 18:40:48 +02:00
|
|
|
*map = em2;
|
|
|
|
|
em = em2;
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
}
|
2018-05-02 15:19:33 +03:00
|
|
|
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (IS_ERR(em2)) {
|
|
|
|
|
ret = PTR_ERR(em2);
|
|
|
|
|
goto out;
|
2018-05-02 15:19:33 +03:00
|
|
|
}
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A is waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
|
|
|
|
|
dio_data->nocow_done = true;
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
} else {
|
|
|
|
|
/* Our caller expects us to free the input extent map. */
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
*map = NULL;
|
|
|
|
|
|
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run on a 12-core box with 64G of RAM, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
|
|
|
if (nowait)
|
btrfs: avoid double nocow check when doing nowait dio writes
When doing a NOWAIT direct IO write we are checking twice if we can NOCOW
into the target file range using can_nocow_extent() - once at the very
beginning of the write path, at btrfs_write_check() via
check_nocow_nolock(), and later again at btrfs_get_blocks_direct_write().
The can_nocow_extent() function does a lot of expensive things - searching
for the file extent item in the inode's subvolume tree, searching for the
extent item in the extent tree, checking delayed references, etc, so it
isn't a very cheap call.
We can remove the first check at btrfs_write_check(), and add there a
quick check to verify if the inode has the NODATACOW or PREALLOC flags,
and quickly bail out if it has neither of those flags, as that
means we have to COW and therefore can't comply with the NOWAIT semantics.
After this we do only one call to can_nocow_extent(), while we are at
btrfs_get_blocks_direct_write(), where we have already locked the file
range and we did a try lock on the range before, at
btrfs_dio_iomap_begin() (since the previous patch in the series).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:25 +00:00
|
|
|
return -EAGAIN;
|
|
|
|
|
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A is waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
/*
|
|
|
|
|
* If we could not allocate data space before locking the file
|
|
|
|
|
* range and we can't do a NOCOW write, then we have to fail.
|
|
|
|
|
*/
|
|
|
|
|
if (!dio_data->data_space_reserved)
|
|
|
|
|
return -ENOSPC;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We have to COW and we have already reserved data space before,
|
|
|
|
|
* so now we reserve only metadata.
|
|
|
|
|
*/
|
|
|
|
|
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
|
|
|
|
|
false);
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (ret < 0)
|
|
|
|
|
goto out;
|
|
|
|
|
space_reserved = true;
|
|
|
|
|
|
|
|
|
|
em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
|
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
*map = em;
|
|
|
|
|
len = min(len, em->len - (start - em->start));
|
|
|
|
|
if (len < prev_len)
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A is waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
btrfs_delalloc_release_metadata(BTRFS_I(inode),
|
|
|
|
|
prev_len - len, true);
|
2018-05-02 15:19:33 +03:00
|
|
|
}
|
|
|
|
|
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
/*
|
|
|
|
|
* We have created our ordered extent, so we can now release our reservation
|
|
|
|
|
* for an outstanding extent.
|
|
|
|
|
*/
|
2022-03-28 21:32:05 +09:00
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
|
2018-05-02 15:19:33 +03:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Need to update the i_size under the extent lock so buffered
|
|
|
|
|
* readers will get the updated i_size when we unlock.
|
|
|
|
|
*/
|
2020-08-17 11:18:21 -05:00
|
|
|
if (start + len > i_size_read(inode))
|
2018-05-02 15:19:33 +03:00
|
|
|
i_size_write(inode, start + len);
|
|
|
|
|
out:
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data block groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (ret && space_reserved) {
|
|
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), len);
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data blocks groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inefficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
}
|
2018-05-02 15:19:33 +03:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-17 11:18:21 -05:00
|
|
|
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
|
|
|
|
|
loff_t length, unsigned int flags, struct iomap *iomap,
|
|
|
|
|
struct iomap *srcmap)
|
2010-05-23 11:00:55 -04:00
|
|
|
{
|
2022-05-05 15:11:12 -05:00
|
|
|
struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2010-05-23 11:00:55 -04:00
|
|
|
struct extent_map *em;
|
2012-07-31 16:28:48 -04:00
|
|
|
struct extent_state *cached_state = NULL;
|
2022-05-05 15:11:12 -05:00
|
|
|
struct btrfs_dio_data *dio_data = iter->private;
|
2012-07-31 16:28:48 -04:00
|
|
|
u64 lockstart, lockend;
|
2020-08-17 11:18:21 -05:00
|
|
|
const bool write = !!(flags & IOMAP_WRITE);
|
2013-02-07 10:12:07 +00:00
|
|
|
int ret = 0;
|
2020-08-17 11:18:21 -05:00
|
|
|
u64 len = length;
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
const u64 data_alloc_len = length;
|
2020-08-17 11:18:21 -05:00
|
|
|
bool unlock_extents = false;
|
2012-07-31 16:28:48 -04:00
|
|
|
|
btrfs: don't allow large NOWAIT direct reads
Dylan and Jens reported a problem where they had an io_uring test that
was returning short reads, and bisected it to ee5b46a353af ("btrfs:
increase direct io read size limit to 256 sectors").
The root cause is their test was doing larger reads via io_uring with
NOWAIT and async. This was triggering a page fault during the direct
read, however the first page was able to work just fine and thus we
submitted a 4k read for a larger iocb.
Btrfs allows for partial IO's in this case specifically because we don't
allow page faults, and thus we'll attempt to do any io that we can,
submit what we could, come back and fault in the rest of the range and
try to do the remaining IO.
However for !is_sync_kiocb() we'll call ->ki_complete() as soon as the
partial dio is done, which is incorrect. In the sync case we can exit
the iomap code, submit more io's, and return with the amount of IO we
were able to complete successfully.
We were always doing short reads in this case, but for NOWAIT we were
getting saved by the fact that we were limiting direct reads to
sectorsize, and if we were larger than that we would return EAGAIN.
Fix the regression by simply returning EAGAIN in the NOWAIT case with
larger reads, that way io_uring can retry and get the larger IO and have
the fault logic handle everything properly.
This still leaves the AIO short read case, but that existed before this
change. The way to properly fix this would be to handle partial iocb
completions, but that's a lot of work, for now deal with the regression
in the most straightforward way possible.
Reported-by: Dylan Yudaken <dylany@fb.com>
Fixes: ee5b46a353af ("btrfs: increase direct io read size limit to 256 sectors")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-08-19 11:53:39 -04:00
|
|
|
/*
|
|
|
|
|
* We could potentially fault if we have a buffer > PAGE_SIZE, and if
|
|
|
|
|
* we're NOWAIT we may submit a bio for a partial range and return
|
|
|
|
|
* EIOCBQUEUED, which would result in an errant short read.
|
|
|
|
|
*
|
|
|
|
|
* The best way to handle this would be to allow for partial completions
|
|
|
|
|
* of iocb's, so we could submit the partial bio, return and fault in
|
|
|
|
|
* the rest of the pages, and then submit the io for the rest of the
|
|
|
|
|
* range. However we don't have that currently, so simply return
|
|
|
|
|
* -EAGAIN at this point so that the normal path is used.
|
|
|
|
|
*/
|
|
|
|
|
if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
btrfs: increase direct io read size limit to 256 sectors
Btrfs currently limits direct I/O reads to a single sector, which goes
back to commit c329861da406 ("Btrfs: don't allocate a separate csums
array for direct reads") from Josef. That commit changes the direct I/O
code to ".. use the private part of the io_tree for our csums.", but ten
years later that isn't how checksums for direct reads work, instead they
use a csums allocation on a per-btrfs_dio_private basis (which have their
own performance problem for small I/O, but that will be addressed later).
There is no fundamental limit in btrfs itself to limit the I/O size
except for the size of the checksum array that scales linearly with
the number of sectors in an I/O. Pick a somewhat arbitrary limit of
256 sectors, which matches what the buffered reads typically see as
the upper limit as the limit for direct I/O as well.
This significantly improves direct read performance. For example a fio
run doing 1 MiB aio reads with a queue depth of 1 roughly triples the
throughput:
Baseline:
READ: bw=65.3MiB/s (68.5MB/s), 65.3MiB/s-65.3MiB/s (68.5MB/s-68.5MB/s), io=19.1GiB (20.6GB), run=300013-300013msec
With this patch:
READ: bw=196MiB/s (206MB/s), 196MiB/s-196MiB/s (206MB/s-206MB/s), io=57.5GiB (61.7GB), run=300006-300006msc
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 08:26:27 +02:00
|
|
|
/*
|
|
|
|
|
* Cap the size of reads to that usually seen in buffered I/O as we need
|
|
|
|
|
* to allocate a contiguous array for the checksums.
|
|
|
|
|
*/
|
2020-08-17 11:18:21 -05:00
|
|
|
if (!write)
|
btrfs: increase direct io read size limit to 256 sectors
Btrfs currently limits direct I/O reads to a single sector, which goes
back to commit c329861da406 ("Btrfs: don't allocate a separate csums
array for direct reads") from Josef. That commit changes the direct I/O
code to ".. use the private part of the io_tree for our csums.", but ten
years later that isn't how checksums for direct reads work, instead they
use a csums allocation on a per-btrfs_dio_private basis (which have their
own performance problem for small I/O, but that will be addressed later).
There is no fundamental limit in btrfs itself to limit the I/O size
except for the size of the checksum array that scales linearly with
the number of sectors in an I/O. Pick a somewhat arbitrary limit of
256 sectors, which matches what the buffered reads typically see as
the upper limit as the limit for direct I/O as well.
This significantly improves direct read performance. For example a fio
run doing 1 MiB aio reads with a queue depth of 1 roughly triples the
throughput:
Baseline:
READ: bw=65.3MiB/s (68.5MB/s), 65.3MiB/s-65.3MiB/s (68.5MB/s-68.5MB/s), io=19.1GiB (20.6GB), run=300013-300013msec
With this patch:
READ: bw=196MiB/s (206MB/s), 196MiB/s-196MiB/s (206MB/s-206MB/s), io=57.5GiB (61.7GB), run=300006-300006msc
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-21 08:26:27 +02:00
|
|
|
len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
|
2012-07-31 16:28:48 -04:00
|
|
|
|
2012-08-03 16:49:19 -04:00
|
|
|
lockstart = start;
|
|
|
|
|
lockend = start + len - 1;
|
|
|
|
|
|
2020-08-17 11:18:21 -05:00
|
|
|
/*
|
btrfs: avoid blocking on page locks with nowait dio on compressed range
If we are doing NOWAIT direct IO read/write and our inode has compressed
extents, we call filemap_fdatawrite_range() against the range in order
to wait for compressed writeback to complete, since the generic code at
iomap_dio_rw() calls filemap_write_and_wait_range() once, which is not
enough to wait for compressed writeback to complete.
This call to filemap_fdatawrite_range() can block on page locks, since
the first writepages() on a range that we will try to compress results
only in queuing a work to compress the data while holding the pages
locked.
Even though the generic code at iomap_dio_rw() will do the right thing
and return -EAGAIN for NOWAIT requests in case there are pages in the
range, we can still end up at btrfs_dio_iomap_begin() with pages in the
range because either of the following can happen:
1) Memory mapped writes, as we haven't locked the range yet;
2) Buffered reads might have started, which lock the pages, and we do
the filemap_fdatawrite_range() call before locking the file range.
So don't call filemap_fdatawrite_range() at btrfs_dio_iomap_begin() if we
are doing a NOWAIT read/write. Instead call filemap_range_needs_writeback()
to check if there are any locked, dirty, or under writeback pages, and
return -EAGAIN if that's the case.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:23 +00:00
|
|
|
* iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
|
|
|
|
|
* enough if we've written compressed pages to this area, so we need to
|
|
|
|
|
* flush the dirty pages again to make absolutely sure that any
|
|
|
|
|
* outstanding dirty pages are on disk - the first flush only starts
|
|
|
|
|
* compression on the data, while keeping the pages locked, so by the
|
|
|
|
|
* time the second flush returns we know bios for the compressed pages
|
|
|
|
|
* were submitted and finished, and the pages no longer under writeback.
|
|
|
|
|
*
|
|
|
|
|
* If we have a NOWAIT request and we have any pages in the range that
|
|
|
|
|
* are locked, likely due to compression still in progress, we don't want
|
|
|
|
|
* to block on page locks. We also don't want to block on pages marked as
|
|
|
|
|
* dirty or under writeback (same as for the non-compression case).
|
|
|
|
|
* iomap_dio_rw() did the same check, but after that and before we got
|
|
|
|
|
* here, mmap'ed writes may have happened or buffered reads started
|
|
|
|
|
* (readpage() and readahead(), which lock pages), as we haven't locked
|
|
|
|
|
* the file range yet.
|
2020-08-17 11:18:21 -05:00
|
|
|
*/
|
|
|
|
|
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
btrfs: avoid blocking on page locks with nowait dio on compressed range
If we are doing NOWAIT direct IO read/write and our inode has compressed
extents, we call filemap_fdatawrite_range() against the range in order
to wait for compressed writeback to complete, since the generic code at
iomap_dio_rw() calls filemap_write_and_wait_range() once, which is not
enough to wait for compressed writeback to complete.
This call to filemap_fdatawrite_range() can block on page locks, since
the first writepages() on a range that we will try to compress results
only in queuing a work to compress the data while holding the pages
locked.
Even though the generic code at iomap_dio_rw() will do the right thing
and return -EAGAIN for NOWAIT requests in case there are pages in the
range, we can still end up at btrfs_dio_iomap_begin() with pages in the
range because either of the following can happen:
1) Memory mapped writes, as we haven't locked the range yet;
2) Buffered reads might have started, which lock the pages, and we do
the filemap_fdatawrite_range() call before locking the file range.
So don't call filemap_fdatawrite_range() at btrfs_dio_iomap_begin() if we
are doing a NOWAIT read/write. Instead call filemap_range_needs_writeback()
to check if there are any locked, dirty, or under writeback pages, and
return -EAGAIN if that's the case.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:23 +00:00
|
|
|
if (flags & IOMAP_NOWAIT) {
|
|
|
|
|
if (filemap_range_needs_writeback(inode->i_mapping,
|
|
|
|
|
lockstart, lockend))
|
|
|
|
|
return -EAGAIN;
|
|
|
|
|
} else {
|
|
|
|
|
ret = filemap_fdatawrite_range(inode->i_mapping, start,
|
|
|
|
|
start + length - 1);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2020-08-17 11:18:21 -05:00
|
|
|
}
|
|
|
|
|
|
2022-05-05 15:11:12 -05:00
|
|
|
memset(dio_data, 0, sizeof(*dio_data));
|
2020-08-17 11:18:21 -05:00
|
|
|
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
/*
|
|
|
|
|
* We always try to allocate data space and must do it before locking
|
|
|
|
|
* the file range, to avoid deadlocks with concurrent writes to the same
|
|
|
|
|
* range if the range has several extents and the writes don't expand the
|
|
|
|
|
* current i_size (the inode lock is taken in shared mode). If we fail to
|
|
|
|
|
* allocate data space here we continue and later, after locking the
|
|
|
|
|
* file range, we fail with ENOSPC only if we figure out we can not do a
|
|
|
|
|
* NOCOW write.
|
|
|
|
|
*/
|
|
|
|
|
if (write && !(flags & IOMAP_NOWAIT)) {
|
|
|
|
|
ret = btrfs_check_data_free_space(BTRFS_I(inode),
|
|
|
|
|
&dio_data->data_reserved,
|
|
|
|
|
start, data_alloc_len);
|
|
|
|
|
if (!ret)
|
|
|
|
|
dio_data->data_space_reserved = true;
|
|
|
|
|
else if (ret && !(BTRFS_I(inode)->flags &
|
|
|
|
|
(BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
|
|
|
|
|
goto err;
|
|
|
|
|
}
|
2015-03-17 10:52:28 -04:00
|
|
|
|
2012-07-31 16:28:48 -04:00
|
|
|
/*
|
|
|
|
|
* If this errors out it's because we couldn't invalidate pagecache for
|
2022-03-23 16:19:24 +00:00
|
|
|
* this range and we need to fallback to buffered IO, or we are doing a
|
|
|
|
|
* NOWAIT read/write and we need to block.
|
2012-07-31 16:28:48 -04:00
|
|
|
*/
|
2022-03-23 16:19:24 +00:00
|
|
|
ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
|
|
|
|
|
if (ret < 0)
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignoring that we account extents
in BTRFS_MAX_EXTENT_SIZE units, so fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.c:btrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must have a value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 09:52:04 +00:00
|
|
|
goto err;
|
2012-07-31 16:28:48 -04:00
|
|
|
|
2019-12-02 17:34:23 -08:00
|
|
|
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
|
2012-07-31 16:28:48 -04:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto unlock_err;
|
|
|
|
|
}
|
2010-05-23 11:00:55 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
|
|
|
|
|
* io. INLINE is special, and we could probably kludge it in here, but
|
|
|
|
|
* it's still buffered so for safety lets just fall back to the generic
|
|
|
|
|
* buffered path.
|
|
|
|
|
*
|
|
|
|
|
* For COMPRESSED we _have_ to read the entire extent in so we can
|
|
|
|
|
* decompress it, so there will be buffering required no matter what we
|
|
|
|
|
* do, so go ahead and fallback to buffered.
|
|
|
|
|
*
|
2016-05-19 21:18:45 -04:00
|
|
|
* We return -ENOTBLK because that's what makes DIO go ahead and go back
|
2010-05-23 11:00:55 -04:00
|
|
|
* to buffered IO. Don't blame me, this is the price we pay for using
|
|
|
|
|
* the generic code.
|
|
|
|
|
*/
|
|
|
|
|
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
|
|
|
|
|
em->block_start == EXTENT_MAP_INLINE) {
|
|
|
|
|
free_extent_map(em);
|
btrfs: return -EAGAIN for NOWAIT dio reads/writes on compressed and inline extents
When doing a direct IO read or write, we always return -ENOTBLK when we
find a compressed extent (or an inline extent) so that we fallback to
buffered IO. This however is not ideal in case we are in a NOWAIT context
(io_uring for example), because buffered IO can block and we currently
have no support for NOWAIT semantics for buffered IO, so if we need to
fallback to buffered IO we should first signal the caller that we may
need to block by returning -EAGAIN instead.
This behaviour can also result in short reads being returned to user
space, which although it's not incorrect and user space should be able
to deal with partial reads, it's somewhat surprising and even some popular
applications like QEMU (Link tag #1) and MariaDB (Link tag #2) don't
deal with short reads properly (or at all).
The short read case happens when we try to read from a range that has a
non-compressed and non-inline extent followed by a compressed extent.
After having read the first extent, when we find the compressed extent we
return -ENOTBLK from btrfs_dio_iomap_begin(), which results in iomap
treating the request as a short read, returning 0 (success) and waiting for
previously submitted bios to complete (this happens at
fs/iomap/direct-io.c:__iomap_dio_rw()). After that, and while at
btrfs_file_read_iter(), we call filemap_read() to use buffered IO to
read the remaining data, and pass it the number of bytes we were able to
read with direct IO. Then at filemap_read() if we get a page fault error
when accessing the read buffer, we return a partial read instead of an
-EFAULT error, because the number of bytes previously read is greater
than zero.
So fix this by returning -EAGAIN for NOWAIT direct IO when we find a
compressed or an inline extent.
Reported-by: Dominique MARTINET <dominique.martinet@atmark-techno.com>
Link: https://lore.kernel.org/linux-btrfs/YrrFGO4A1jS0GI0G@atmark-techno.com/
Link: https://jira.mariadb.org/browse/MDEV-27900?focusedCommentId=216582&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-216582
Tested-by: Dominique MARTINET <dominique.martinet@atmark-techno.com>
CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-07-04 12:42:03 +01:00
|
|
|
/*
|
|
|
|
|
* If we are in a NOWAIT context, return -EAGAIN in order to
|
|
|
|
|
* fallback to buffered IO. This is not only because we can
|
|
|
|
|
* block with buffered IO (no support for NOWAIT semantics at
|
|
|
|
|
* the moment) but also to avoid returning short reads to user
|
|
|
|
|
* space - this happens if we were able to read some data from
|
|
|
|
|
* previous non-compressed extents and then when we fallback to
|
|
|
|
|
* buffered IO, at btrfs_file_read_iter() by calling
|
|
|
|
|
* filemap_read(), we fail to fault in pages for the read buffer,
|
|
|
|
|
* in which case filemap_read() returns a short read (the number
|
|
|
|
|
* of bytes previously read is > 0, so it does not return -EFAULT).
|
|
|
|
|
*/
|
|
|
|
|
ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
|
2012-07-31 16:28:48 -04:00
|
|
|
goto unlock_err;
|
2010-05-23 11:00:55 -04:00
|
|
|
}
|
|
|
|
|
|
2020-08-17 11:18:21 -05:00
|
|
|
len = min(len, em->len - (start - em->start));
|
btrfs: fallback to blocking mode when doing async dio over multiple extents
Some users recently reported that MariaDB was getting a read corruption
when using io_uring on top of btrfs. This started to happen in 5.16,
after commit 51bd9563b6783d ("btrfs: fix deadlock due to page faults
during direct IO reads and writes"). That changed btrfs to use the new
iomap flag IOMAP_DIO_PARTIAL and to disable page faults before calling
iomap_dio_rw(). This was necessary to fix deadlocks when the iovector
corresponds to a memory mapped file region. That type of scenario is
exercised by test case generic/647 from fstests.
For this MariaDB scenario, we attempt to read 16K from file offset X
using IOCB_NOWAIT and io_uring. In that range we have 4 extents, each
with a size of 4K, and what happens is the following:
1) btrfs_direct_read() disables page faults and calls iomap_dio_rw();
2) iomap creates a struct iomap_dio object, its reference count is
initialized to 1 and its ->size field is initialized to 0;
3) iomap calls btrfs_dio_iomap_begin() with file offset X, which finds
the first 4K extent, and sets up an iomap for this extent consisting
of a single page;
4) At iomap_dio_bio_iter(), we are able to access the first page of the
buffer (struct iov_iter) with bio_iov_iter_get_pages() without
triggering a page fault;
5) iomap submits a bio for this 4K extent
(iomap_dio_submit_bio() -> btrfs_submit_direct()) and increments
the refcount on the struct iomap_dio object to 2; The ->size field
of the struct iomap_dio object is incremented to 4K;
6) iomap calls btrfs_dio_iomap_begin() again, this time with a file
offset of X + 4K. There we set up an iomap for the next extent
that also has a size of 4K;
7) Then at iomap_dio_bio_iter() we call bio_iov_iter_get_pages(),
which tries to access the next page (2nd page) of the buffer.
This triggers a page fault and returns -EFAULT;
8) At __iomap_dio_rw() we see the -EFAULT, but we reset the error
to 0 because we passed the flag IOMAP_DIO_PARTIAL to iomap and
the struct iomap_dio object has a ->size value of 4K (we submitted
a bio for an extent already). The 'wait_for_completion' variable
is not set to true, because our iocb has IOCB_NOWAIT set;
9) At the bottom of __iomap_dio_rw(), we decrement the reference count
of the struct iomap_dio object from 2 to 1. Because we were not
the only ones holding a reference on it and 'wait_for_completion' is
set to false, -EIOCBQUEUED is returned to btrfs_direct_read(), which
just returns it up the callchain, up to io_uring;
10) The bio submitted for the first extent (step 5) completes and its
bio endio function, iomap_dio_bio_end_io(), decrements the last
reference on the struct iomap_dio object, resulting in calling
iomap_dio_complete_work() -> iomap_dio_complete().
11) At iomap_dio_complete() we adjust the iocb->ki_pos from X to X + 4K
and return 4K (the amount of io done) to iomap_dio_complete_work();
12) iomap_dio_complete_work() calls the iocb completion callback,
iocb->ki_complete() with a second argument value of 4K (total io
done) and the iocb with the adjusted ki_pos of X + 4K. This results
in completing the read request for io_uring, leaving it with a
result of 4K bytes read, and only the first page of the buffer
filled in, while the remaining 3 pages, corresponding to the other
3 extents, were not filled;
13) For the application, the result is unexpected because if we ask
to read N bytes, it expects to get N bytes read as long as those
N bytes don't cross the EOF (i_size).
MariaDB reports this as an error, as it's not expecting a short read,
since it knows it's asking for read operations fully within the i_size
boundary. This is typical in many applications, but it may also be
questionable if they should react to such short reads by issuing more
read calls to get the remaining data. Nevertheless, the short read
happened due to a change in btrfs regarding how it deals with page
faults while in the middle of a read operation, and there's no reason
why btrfs can't have the previous behaviour of returning the whole data
that was requested by the application.
The problem can also be triggered with the following simple program:
/* Get O_DIRECT */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <liburing.h>
/*
 * Reproducer for an unexpected short read with io_uring + O_DIRECT.
 *
 * Takes a single argument, a directory, and creates a file "foo" in it
 * made of two page sized extents. It then reads both extents with a single
 * io_uring readv into a buffer that has only its first page faulted in, and
 * prints the completion result together with a comparison of the data read
 * against the data written.
 *
 * Returns 0 after printing the results, 1 on any setup failure.
 */
int main(int argc, char *argv[])
{
	char *foo_path;
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iovec;
	int fd;
	long pagesize;
	void *write_buf;
	void *read_buf;
	ssize_t ret;
	int i;

	if (argc != 2) {
		fprintf(stderr, "Use: %s <directory>\n", argv[0]);
		return 1;
	}

	/* Room for the directory, the "/foo" suffix and the terminating NUL. */
	foo_path = malloc(strlen(argv[1]) + 5);
	if (!foo_path) {
		fprintf(stderr, "Failed to allocate memory for file path\n");
		return 1;
	}
	strcpy(foo_path, argv[1]);
	strcat(foo_path, "/foo");

	/*
	 * Create file foo with 2 extents, each with a size matching
	 * the page size. Then allocate a buffer to read both extents
	 * with io_uring, using O_DIRECT and IOCB_NOWAIT. Before doing
	 * the read with io_uring, access the first page of the buffer
	 * to fault it in, so that during the read we only trigger a
	 * page fault when accessing the second page of the buffer.
	 */
	fd = open(foo_path, O_CREAT | O_TRUNC | O_WRONLY |
		  O_DIRECT, 0666);
	if (fd == -1) {
		fprintf(stderr,
			"Failed to create file 'foo': %s (errno %d)",
			strerror(errno), errno);
		return 1;
	}

	pagesize = sysconf(_SC_PAGE_SIZE);
	ret = posix_memalign(&write_buf, pagesize, 2 * pagesize);
	if (ret) {
		fprintf(stderr, "Failed to allocate write buffer\n");
		return 1;
	}

	/*
	 * Give each page a distinct pattern. Pointer arithmetic is done on
	 * char * because arithmetic on void * is a GNU extension.
	 */
	memset(write_buf, 0xab, pagesize);
	memset((char *)write_buf + pagesize, 0xcd, pagesize);

	/* Create 2 extents, each with a size matching page size. */
	for (i = 0; i < 2; i++) {
		ret = pwrite(fd, (char *)write_buf + i * pagesize, pagesize,
			     i * pagesize);
		if (ret != pagesize) {
			/* ret is a ssize_t, so print it with %zd. */
			fprintf(stderr,
				"Failed to write to file, ret = %zd errno %d (%s)\n",
				ret, errno, strerror(errno));
			return 1;
		}
		/* fsync after each write so that each page gets its own extent. */
		ret = fsync(fd);
		if (ret != 0) {
			fprintf(stderr, "Failed to fsync file\n");
			return 1;
		}
	}

	close(fd);
	fd = open(foo_path, O_RDONLY | O_DIRECT);
	if (fd == -1) {
		fprintf(stderr,
			"Failed to open file 'foo': %s (errno %d)",
			strerror(errno), errno);
		return 1;
	}

	ret = posix_memalign(&read_buf, pagesize, 2 * pagesize);
	if (ret) {
		fprintf(stderr, "Failed to allocate read buffer\n");
		return 1;
	}

	/*
	 * Fault in only the first page of the read buffer.
	 * We want to trigger a page fault for the 2nd page of the
	 * read buffer during the read operation with io_uring
	 * (O_DIRECT and IOCB_NOWAIT).
	 */
	memset(read_buf, 0, 1);

	ret = io_uring_queue_init(1, &ring, 0);
	if (ret != 0) {
		fprintf(stderr, "Failed to create io_uring queue\n");
		return 1;
	}

	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		fprintf(stderr, "Failed to get io_uring sqe\n");
		return 1;
	}

	/* A single readv covering both extents, starting at file offset 0. */
	iovec.iov_base = read_buf;
	iovec.iov_len = 2 * pagesize;
	io_uring_prep_readv(sqe, fd, &iovec, 1, 0);

	ret = io_uring_submit_and_wait(&ring, 1);
	if (ret != 1) {
		fprintf(stderr,
			"Failed at io_uring_submit_and_wait()\n");
		return 1;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		fprintf(stderr, "Failed at io_uring_wait_cqe()\n");
		return 1;
	}

	printf("io_uring read result for file foo:\n\n");
	/* pagesize is a long, so the expected value must be printed with %ld. */
	printf(" cqe->res == %d (expected %ld)\n", cqe->res, 2 * pagesize);
	printf(" memcmp(read_buf, write_buf) == %d (expected 0)\n",
	       memcmp(read_buf, write_buf, 2 * pagesize));

	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);

	return 0;
}
When running it on an unpatched kernel:
$ gcc io_uring_test.c -luring
$ mkfs.btrfs -f /dev/sda
$ mount /dev/sda /mnt/sda
$ ./a.out /mnt/sda
io_uring read result for file foo:
cqe->res == 4096 (expected 8192)
memcmp(read_buf, write_buf) == -205 (expected 0)
After this patch, the read always returns 8192 bytes, with the buffer
filled with the correct data. Although that reproducer always triggers
the bug in my test vms, it's possible that it will not be so reliable
on other environments, as that can happen if the bio for the first
extent completes and decrements the reference on the struct iomap_dio
object before we do the atomic_dec_and_test() on the reference at
__iomap_dio_rw().
Fix this in btrfs by having btrfs_dio_iomap_begin() return -EAGAIN
whenever we try to satisfy a non blocking IO request (IOMAP_NOWAIT flag
set) over a range that spans multiple extents (or a mix of extents and
holes). This avoids returning success to the caller when we only did
partial IO, which is not optimal for writes and for reads it's actually
incorrect, as the caller doesn't expect to get fewer bytes read than it has
requested (unless EOF is crossed), as previously mentioned. This is also
the type of behaviour that xfs follows (xfs_direct_write_iomap_begin()),
even though it doesn't use IOMAP_DIO_PARTIAL.
A test case for fstests will follow soon.
Link: https://lore.kernel.org/linux-btrfs/CABVffEM0eEWho+206m470rtM0d9J8ue85TtR-A_oVTuGLWFicA@mail.gmail.com/
Link: https://lore.kernel.org/linux-btrfs/CAHF2GV6U32gmqSjLe=XKgfcZAmLCiH26cJ2OnHGp5x=VAH4OHQ@mail.gmail.com/
CC: stable@vger.kernel.org # 5.16+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-02 11:48:39 +00:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we have a NOWAIT request and the range contains multiple extents
|
|
|
|
|
* (or a mix of extents and holes), then we return -EAGAIN to make the
|
|
|
|
|
* caller fallback to a context where it can do a blocking (without
|
|
|
|
|
* NOWAIT) request. This way we avoid doing partial IO and returning
|
|
|
|
|
* success to the caller, which is not optimal for writes and for reads
|
|
|
|
|
* it can result in unexpected behaviour for an application.
|
|
|
|
|
*
|
|
|
|
|
* When doing a read, because we use IOMAP_DIO_PARTIAL when calling
|
|
|
|
|
* iomap_dio_rw(), we can end up returning less data than what the caller
|
|
|
|
|
* asked for, resulting in an unexpected, and incorrect, short read.
|
|
|
|
|
* That is, the caller asked to read N bytes and we return less than that,
|
|
|
|
|
* which is wrong unless we are crossing EOF. This happens if we get a
|
|
|
|
|
* page fault error when trying to fault in pages for the buffer that is
|
|
|
|
|
* associated to the struct iov_iter passed to iomap_dio_rw(), and we
|
|
|
|
|
* have previously submitted bios for other extents in the range, in
|
|
|
|
|
* which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
|
|
|
|
|
* those bios have completed by the time we get the page fault error,
|
|
|
|
|
* which we return back to our caller - we should only return EIOCBQUEUED
|
|
|
|
|
* after we have submitted bios for all the extents in the range.
|
|
|
|
|
*/
|
|
|
|
|
if ((flags & IOMAP_NOWAIT) && len < length) {
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
ret = -EAGAIN;
|
|
|
|
|
goto unlock_err;
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-17 11:18:21 -05:00
|
|
|
if (write) {
|
|
|
|
|
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
|
btrfs: avoid double nocow check when doing nowait dio writes
When doing a NOWAIT direct IO write we are checking twice if we can COW
into the target file range using can_nocow_extent() - once at the very
beginning of the write path, at btrfs_write_check() via
check_nocow_nolock(), and later again at btrfs_get_blocks_direct_write().
The can_nocow_extent() function does a lot of expensive things - searching
for the file extent item in the inode's subvolume tree, searching for the
extent item in the extent tree, checking delayed references, etc, so it
isn't a very cheap call.
We can remove the first check at btrfs_write_check(), and add there a
quick check to verify if the inode has the NODATACOW or PREALLOC flags,
and quickly bail out if it has neither of those flags, as that
means we have to COW and therefore can't comply with the NOWAIT semantics.
After this we do only one call to can_nocow_extent(), while we are at
btrfs_get_blocks_direct_write(), where we have already locked the file
range and we did a try lock on the range before, at
btrfs_dio_iomap_begin() (since the previous patch in the series).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:25 +00:00
|
|
|
start, len, flags);
|
2018-05-02 15:19:33 +03:00
|
|
|
if (ret < 0)
|
|
|
|
|
goto unlock_err;
|
2020-08-17 11:18:21 -05:00
|
|
|
unlock_extents = true;
|
|
|
|
|
/* Recalc len in case the new em is smaller than requested */
|
|
|
|
|
len = min(len, em->len - (start - em->start));
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
if (dio_data->data_space_reserved) {
|
|
|
|
|
u64 release_offset;
|
|
|
|
|
u64 release_len = 0;
|
|
|
|
|
|
|
|
|
|
if (dio_data->nocow_done) {
|
|
|
|
|
release_offset = start;
|
|
|
|
|
release_len = data_alloc_len;
|
|
|
|
|
} else if (len < data_alloc_len) {
|
|
|
|
|
release_offset = start + len;
|
|
|
|
|
release_len = data_alloc_len - len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (release_len > 0)
|
|
|
|
|
btrfs_free_reserved_data_space(BTRFS_I(inode),
|
|
|
|
|
dio_data->data_reserved,
|
|
|
|
|
release_offset,
|
|
|
|
|
release_len);
|
|
|
|
|
}
|
2018-05-02 15:19:33 +03:00
|
|
|
} else {
|
2018-05-02 15:19:32 +03:00
|
|
|
/*
|
|
|
|
|
* We need to unlock only the end area that we aren't using.
|
|
|
|
|
* The rest is going to be unlocked by the endio routine.
|
|
|
|
|
*/
|
2020-08-17 11:18:21 -05:00
|
|
|
lockstart = start + len;
|
|
|
|
|
if (lockstart < lockend)
|
|
|
|
|
unlock_extents = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (unlock_extents)
|
|
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
|
|
|
|
|
lockstart, lockend, &cached_state);
|
|
|
|
|
else
|
|
|
|
|
free_extent_state(cached_state);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Translate extent map information to iomap.
|
|
|
|
|
* We trim the extents (and move the addr) even though iomap code does
|
|
|
|
|
* that, since we have locked only the parts we are performing I/O in.
|
|
|
|
|
*/
|
|
|
|
|
if ((em->block_start == EXTENT_MAP_HOLE) ||
|
|
|
|
|
(test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
|
|
|
|
|
iomap->addr = IOMAP_NULL_ADDR;
|
|
|
|
|
iomap->type = IOMAP_HOLE;
|
|
|
|
|
} else {
|
|
|
|
|
iomap->addr = em->block_start + (start - em->start);
|
|
|
|
|
iomap->type = IOMAP_MAPPED;
|
2020-05-19 09:14:18 -05:00
|
|
|
}
|
2020-08-17 11:18:21 -05:00
|
|
|
iomap->offset = start;
|
2021-08-24 13:05:19 +08:00
|
|
|
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
|
2020-08-17 11:18:21 -05:00
|
|
|
iomap->length = len;
|
2020-05-19 09:14:18 -05:00
|
|
|
|
2021-05-19 00:40:27 +09:00
|
|
|
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
|
2021-02-04 19:22:06 +09:00
|
|
|
iomap->flags |= IOMAP_F_ZONE_APPEND;
|
|
|
|
|
|
2010-05-23 11:00:55 -04:00
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
|
|
return 0;
|
2012-07-31 16:28:48 -04:00
|
|
|
|
|
|
|
|
unlock_err:
|
2019-08-15 14:04:04 -07:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
|
|
|
|
&cached_state);
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up attempting to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignoring that we account extents
in BTRFS_MAX_EXTENT_SIZE units, so fix that as well.
The following test case for fstests reproduces this issue:
# Regression test for btrfs: a direct IO write that is only partially done
# through the direct IO path (the rest falling back to buffered IO) must not
# corrupt the accounting of the inode's outstanding extents counter.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
# The trap below exits with $status, which is only set to 0 at the very end.
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
# Remove any temporary files created under the $tmp prefix.
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
# Mount with compression so the buffered write below yields a compressed
# extent (presumably what "-o compress" is passed for here).
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.c:btrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must have a value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
# Reaching this point means every step ran; report success via the trap.
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 09:52:04 +00:00
|
|
|
err:
|
btrfs: fix deadlock between concurrent dio writes when low on free data space
When reserving data space for a direct IO write we can end up deadlocking
if we have multiple tasks attempting a write to the same file range, there
are multiple extents covered by that file range, we are low on available
space for data and the writes don't expand the inode's i_size.
The deadlock can happen like this:
1) We have a file with an i_size of 1M, at offset 0 it has an extent with
a size of 128K and at offset 128K it has another extent also with a
size of 128K;
2) Task A does a direct IO write against file range [0, 256K), and because
the write is within the i_size boundary, it takes the inode's lock (VFS
level) in shared mode;
3) Task A locks the file range [0, 256K) at btrfs_dio_iomap_begin(), and
then gets the extent map for the extent covering the range [0, 128K).
At btrfs_get_blocks_direct_write(), it creates an ordered extent for
that file range ([0, 128K));
4) Before returning from btrfs_dio_iomap_begin(), it unlocks the file
range [0, 256K);
5) Task A executes btrfs_dio_iomap_begin() again, this time for the file
range [128K, 256K), and locks the file range [128K, 256K);
6) Task B starts a direct IO write against file range [0, 256K) as well.
It also locks the inode in shared mode, as it's within the i_size limit,
and then tries to lock file range [0, 256K). It is able to lock the
subrange [0, 128K) but then blocks waiting for the range [128K, 256K),
as it is currently locked by task A;
7) Task A enters btrfs_get_blocks_direct_write() and tries to reserve data
space. Because we are low on available free space, it triggers the
async data reclaim task, and waits for it to reserve data space;
8) The async reclaim task decides to wait for all existing ordered extents
to complete (through btrfs_wait_ordered_roots()).
It finds the ordered extent previously created by task A for the file
range [0, 128K) and waits for it to complete;
9) The ordered extent for the file range [0, 128K) can not complete
because it blocks at btrfs_finish_ordered_io() when trying to lock the
file range [0, 128K).
This results in a deadlock, because:
- task B is holding the file range [0, 128K) locked, waiting for the
range [128K, 256K) to be unlocked by task A;
- task A is holding the file range [128K, 256K) locked and it's waiting
for the async data reclaim task to satisfy its space reservation
request;
- the async data reclaim task is waiting for ordered extent [0, 128K)
to complete, but the ordered extent can not complete because the
file range [0, 128K) is currently locked by task B, which is waiting
on task A to unlock file range [128K, 256K) and task A waiting
on the async data reclaim task.
This results in a deadlock between 4 tasks: task A, task B, the async
data reclaim task and the task doing ordered extent completion (a work
queue task).
This type of deadlock can sporadically be triggered by the test case
generic/300 from fstests, and results in a stack trace like the following:
[12084.033689] INFO: task kworker/u16:7:123749 blocked for more than 241 seconds.
[12084.034877] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.035562] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.036548] task:kworker/u16:7 state:D stack: 0 pid:123749 ppid: 2 flags:0x00004000
[12084.036554] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[12084.036599] Call Trace:
[12084.036601] <TASK>
[12084.036606] __schedule+0x3cb/0xed0
[12084.036616] schedule+0x4e/0xb0
[12084.036620] btrfs_start_ordered_extent+0x109/0x1c0 [btrfs]
[12084.036651] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.036659] btrfs_run_ordered_extent_work+0x1a/0x30 [btrfs]
[12084.036688] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.036719] ? lock_is_held_type+0xe8/0x140
[12084.036727] process_one_work+0x252/0x5a0
[12084.036736] ? process_one_work+0x5a0/0x5a0
[12084.036738] worker_thread+0x52/0x3b0
[12084.036743] ? process_one_work+0x5a0/0x5a0
[12084.036745] kthread+0xf2/0x120
[12084.036747] ? kthread_complete_and_exit+0x20/0x20
[12084.036751] ret_from_fork+0x22/0x30
[12084.036765] </TASK>
[12084.036769] INFO: task kworker/u16:11:153787 blocked for more than 241 seconds.
[12084.037702] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.038540] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.039506] task:kworker/u16:11 state:D stack: 0 pid:153787 ppid: 2 flags:0x00004000
[12084.039511] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
[12084.039551] Call Trace:
[12084.039553] <TASK>
[12084.039557] __schedule+0x3cb/0xed0
[12084.039566] schedule+0x4e/0xb0
[12084.039569] schedule_timeout+0xed/0x130
[12084.039573] ? mark_held_locks+0x50/0x80
[12084.039578] ? _raw_spin_unlock_irq+0x24/0x50
[12084.039580] ? lockdep_hardirqs_on+0x7d/0x100
[12084.039585] __wait_for_common+0xaf/0x1f0
[12084.039587] ? usleep_range_state+0xb0/0xb0
[12084.039596] btrfs_wait_ordered_extents+0x3d6/0x470 [btrfs]
[12084.039636] btrfs_wait_ordered_roots+0x175/0x240 [btrfs]
[12084.039670] flush_space+0x25b/0x630 [btrfs]
[12084.039712] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
[12084.039747] process_one_work+0x252/0x5a0
[12084.039756] ? process_one_work+0x5a0/0x5a0
[12084.039758] worker_thread+0x52/0x3b0
[12084.039762] ? process_one_work+0x5a0/0x5a0
[12084.039765] kthread+0xf2/0x120
[12084.039766] ? kthread_complete_and_exit+0x20/0x20
[12084.039770] ret_from_fork+0x22/0x30
[12084.039783] </TASK>
[12084.039800] INFO: task kworker/u16:17:217907 blocked for more than 241 seconds.
[12084.040709] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.041398] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.042404] task:kworker/u16:17 state:D stack: 0 pid:217907 ppid: 2 flags:0x00004000
[12084.042411] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
[12084.042461] Call Trace:
[12084.042463] <TASK>
[12084.042471] __schedule+0x3cb/0xed0
[12084.042485] schedule+0x4e/0xb0
[12084.042490] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.042539] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.042551] lock_extent_bits+0x37/0x90 [btrfs]
[12084.042601] btrfs_finish_ordered_io.isra.0+0x3fd/0x960 [btrfs]
[12084.042656] ? lock_is_held_type+0xe8/0x140
[12084.042667] btrfs_work_helper+0xf8/0x400 [btrfs]
[12084.042716] ? lock_is_held_type+0xe8/0x140
[12084.042727] process_one_work+0x252/0x5a0
[12084.042742] worker_thread+0x52/0x3b0
[12084.042750] ? process_one_work+0x5a0/0x5a0
[12084.042754] kthread+0xf2/0x120
[12084.042757] ? kthread_complete_and_exit+0x20/0x20
[12084.042763] ret_from_fork+0x22/0x30
[12084.042783] </TASK>
[12084.042798] INFO: task fio:234517 blocked for more than 241 seconds.
[12084.043598] Not tainted 5.18.0-rc2-btrfs-next-115 #1
[12084.044282] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12084.045244] task:fio state:D stack: 0 pid:234517 ppid:234515 flags:0x00004000
[12084.045248] Call Trace:
[12084.045250] <TASK>
[12084.045254] __schedule+0x3cb/0xed0
[12084.045263] schedule+0x4e/0xb0
[12084.045266] wait_extent_bit.constprop.0+0x1eb/0x260 [btrfs]
[12084.045298] ? prepare_to_wait_exclusive+0xc0/0xc0
[12084.045306] lock_extent_bits+0x37/0x90 [btrfs]
[12084.045336] btrfs_dio_iomap_begin+0x336/0xc60 [btrfs]
[12084.045370] ? lock_is_held_type+0xe8/0x140
[12084.045378] iomap_iter+0x184/0x4c0
[12084.045383] __iomap_dio_rw+0x2c6/0x8a0
[12084.045406] iomap_dio_rw+0xa/0x30
[12084.045408] btrfs_do_write_iter+0x370/0x5e0 [btrfs]
[12084.045440] aio_write+0xfa/0x2c0
[12084.045448] ? __might_fault+0x2a/0x70
[12084.045451] ? kvm_sched_clock_read+0x14/0x40
[12084.045455] ? lock_release+0x153/0x4a0
[12084.045463] io_submit_one+0x615/0x9f0
[12084.045467] ? __might_fault+0x2a/0x70
[12084.045469] ? kvm_sched_clock_read+0x14/0x40
[12084.045478] __x64_sys_io_submit+0x83/0x160
[12084.045483] ? syscall_enter_from_user_mode+0x1d/0x50
[12084.045489] do_syscall_64+0x3b/0x90
[12084.045517] entry_SYSCALL_64_after_hwframe+0x44/0xae
[12084.045521] RIP: 0033:0x7fa76511af79
[12084.045525] RSP: 002b:00007ffd6d6b9058 EFLAGS: 00000246 ORIG_RAX: 00000000000000d1
[12084.045530] RAX: ffffffffffffffda RBX: 00007fa75ba6e760 RCX: 00007fa76511af79
[12084.045532] RDX: 0000557b304ff3f0 RSI: 0000000000000001 RDI: 00007fa75ba4c000
[12084.045535] RBP: 00007fa75ba4c000 R08: 00007fa751b76000 R09: 0000000000000330
[12084.045537] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
[12084.045540] R13: 0000000000000000 R14: 0000557b304ff3f0 R15: 0000557b30521eb0
[12084.045561] </TASK>
Fix this issue by always reserving data space before locking a file range
at btrfs_dio_iomap_begin(). If we can't reserve the space, then we don't
error out immediately - instead after locking the file range, check if we
can do a NOCOW write, and if we can we don't error out since we don't need
to allocate a data extent, however if we can't NOCOW then error out with
-ENOSPC. This also implies that we may end up reserving space when it's
not needed because the write will end up being done in NOCOW mode - in that
case we just release the space after we noticed we did a NOCOW write - this
is the same type of logic that is done in the path for buffered IO writes.
Fixes: f0bfa76a11e93d ("btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range")
CC: stable@vger.kernel.org # 5.17+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-04-28 14:59:46 +01:00
|
|
|
if (dio_data->data_space_reserved) {
|
|
|
|
|
btrfs_free_reserved_data_space(BTRFS_I(inode),
|
|
|
|
|
dio_data->data_reserved,
|
|
|
|
|
start, data_alloc_len);
|
|
|
|
|
extent_changeset_free(dio_data->data_reserved);
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-17 11:18:21 -05:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
|
|
|
|
|
ssize_t written, unsigned int flags, struct iomap *iomap)
|
|
|
|
|
{
|
2022-05-05 15:11:12 -05:00
|
|
|
struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
|
|
|
|
|
struct btrfs_dio_data *dio_data = iter->private;
|
2020-08-17 11:18:21 -05:00
|
|
|
size_t submitted = dio_data->submitted;
|
|
|
|
|
const bool write = !!(flags & IOMAP_WRITE);
|
2022-05-05 15:11:12 -05:00
|
|
|
int ret = 0;
|
2020-08-17 11:18:21 -05:00
|
|
|
|
|
|
|
|
if (!write && (iomap->type == IOMAP_HOLE)) {
|
|
|
|
|
/* If reading from a hole, unlock and return */
|
|
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
|
2022-05-05 15:11:12 -05:00
|
|
|
return 0;
|
2020-08-17 11:18:21 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (submitted < length) {
|
|
|
|
|
pos += submitted;
|
|
|
|
|
length -= submitted;
|
|
|
|
|
if (write)
|
2022-06-19 08:07:05 +02:00
|
|
|
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
|
|
|
|
|
pos, length, false);
|
2020-08-17 11:18:21 -05:00
|
|
|
else
|
|
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
|
|
|
|
|
pos + length - 1);
|
|
|
|
|
ret = -ENOTBLK;
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:
1) There are no data blocks groups with enough free space matching
the size of the write;
2) There's not enough unallocated space for allocating a new data block
group;
3) The extents in the target file range are not shared, neither through
snapshots nor through reflinks.
This is wrong because a NOCOW write can be done in such case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.
The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if it that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.
Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies a bit the logic and removes
the inneficiency of doing unnecessary data reservations.
The following example test script reproduces the problem:
$ cat dio-nocow-enospc.sh
#!/bin/bash
DEV=/dev/sdj
MNT=/mnt/sdj
# Use a small fixed size (1G) filesystem so that it's quick to fill
# it up.
# Make sure the mixed block groups feature is not enabled because we
# later want to not have more space available for allocating data
# extents but still have enough metadata space free for the file writes.
mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
mount $DEV $MNT
# Create our test file with the NOCOW attribute set.
touch $MNT/foobar
chattr +C $MNT/foobar
# Now fill in all unallocated space with data for our test file.
# This will allocate a data block group that will be full and leave
# no (or a very small amount of) unallocated space in the device, so
# that it will not be possible to allocate a new block group later.
echo
echo "Creating test file with initial data..."
xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
# Now try a direct IO write against file range [0, 10M[.
# This should succeed since this is a NOCOW file and an extent for the
# range was previously allocated.
echo
echo "Trying direct IO write over allocated space..."
xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
umount $MNT
When running the test:
$ ./dio-nocow-enospc.sh
(...)
Creating test file with initial data...
wrote 943718400/943718400 bytes at offset 0
900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
Trying direct IO write over allocated space...
pwrite: No space left on device
A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-10-28 16:03:41 +01:00
|
|
|
if (write)
|
2020-08-17 11:18:21 -05:00
|
|
|
extent_changeset_free(dio_data->data_reserved);
|
2014-09-12 18:44:03 +08:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-16 14:46:22 -07:00
|
|
|
static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
|
2014-09-12 18:44:03 +08:00
|
|
|
{
|
2020-04-16 14:46:22 -07:00
|
|
|
/*
|
|
|
|
|
* This implies a barrier so that stores to dio_bio->bi_status before
|
|
|
|
|
* this and loads of dio_bio->bi_status after this are fully ordered.
|
|
|
|
|
*/
|
|
|
|
|
if (!refcount_dec_and_test(&dip->refs))
|
|
|
|
|
return;
|
2014-09-12 18:44:03 +08:00
|
|
|
|
2022-05-05 15:11:15 -05:00
|
|
|
if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
|
2022-06-19 08:07:05 +02:00
|
|
|
btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
|
|
|
|
|
dip->file_offset, dip->bytes,
|
|
|
|
|
!dip->bio.bi_status);
|
2020-04-16 14:46:22 -07:00
|
|
|
} else {
|
|
|
|
|
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
|
2021-10-08 15:29:59 +08:00
|
|
|
dip->file_offset,
|
|
|
|
|
dip->file_offset + dip->bytes - 1);
|
2014-09-12 18:44:03 +08:00
|
|
|
}
|
|
|
|
|
|
2022-05-05 15:11:15 -05:00
|
|
|
kfree(dip->csums);
|
|
|
|
|
bio_endio(&dip->bio);
|
2014-09-12 18:44:03 +08:00
|
|
|
}
|
|
|
|
|
|
2022-04-15 16:33:28 +02:00
|
|
|
static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
|
2021-07-27 14:59:41 +02:00
|
|
|
int mirror_num,
|
|
|
|
|
enum btrfs_compression_type compress_type)
|
2014-09-12 18:44:03 +08:00
|
|
|
{
|
2022-08-06 10:03:26 +02:00
|
|
|
struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
|
2016-06-22 18:54:24 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2014-09-12 18:44:03 +08:00
|
|
|
|
2016-06-05 14:31:52 -05:00
|
|
|
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
|
2014-09-12 18:44:03 +08:00
|
|
|
|
2020-04-16 14:46:25 -07:00
|
|
|
refcount_inc(&dip->refs);
|
2022-06-17 12:04:07 +02:00
|
|
|
btrfs_submit_bio(fs_info, bio, mirror_num);
|
2014-09-12 18:44:03 +08:00
|
|
|
}
|
|
|
|
|
|
2021-10-08 15:30:00 +08:00
|
|
|
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
|
2021-09-15 15:17:18 +08:00
|
|
|
struct btrfs_bio *bbio,
|
2020-04-16 14:46:23 -07:00
|
|
|
const bool uptodate)
|
2010-05-23 11:00:55 -04:00
|
|
|
{
|
2021-10-08 15:30:00 +08:00
|
|
|
struct inode *inode = dip->inode;
|
2020-04-16 14:46:23 -07:00
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
|
|
|
|
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
|
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
|
2017-08-22 23:45:59 -07:00
|
|
|
blk_status_t err = BLK_STS_OK;
|
2022-05-22 13:47:54 +02:00
|
|
|
struct bvec_iter iter;
|
|
|
|
|
struct bio_vec bv;
|
|
|
|
|
u32 offset;
|
|
|
|
|
|
|
|
|
|
btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
|
|
|
|
|
u64 start = bbio->file_offset + offset;
|
|
|
|
|
|
|
|
|
|
if (uptodate &&
|
2022-07-07 07:33:29 +02:00
|
|
|
(!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
|
|
|
|
|
bv.bv_offset))) {
|
2022-05-22 13:47:54 +02:00
|
|
|
clean_io_failure(fs_info, failure_tree, io_tree, start,
|
|
|
|
|
bv.bv_page, btrfs_ino(BTRFS_I(inode)),
|
|
|
|
|
bv.bv_offset);
|
|
|
|
|
} else {
|
|
|
|
|
int ret;
|
2010-05-23 11:00:55 -04:00
|
|
|
|
2022-07-07 07:33:28 +02:00
|
|
|
ret = btrfs_repair_one_sector(inode, bbio, offset,
|
|
|
|
|
bv.bv_page, bv.bv_offset,
|
2022-05-22 13:47:54 +02:00
|
|
|
submit_dio_repair_bio);
|
|
|
|
|
if (ret)
|
|
|
|
|
err = errno_to_blk_status(ret);
|
2016-01-21 15:55:55 +05:30
|
|
|
}
|
2013-11-07 12:20:26 -08:00
|
|
|
}
|
2014-09-12 18:43:56 +08:00
|
|
|
|
|
|
|
|
return err;
|
2015-11-24 16:23:54 +00:00
|
|
|
}
|
|
|
|
|
|
2020-10-21 14:24:53 +08:00
|
|
|
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
|
2020-12-02 14:47:57 +08:00
|
|
|
struct bio *bio,
|
|
|
|
|
u64 dio_file_offset)
|
2010-05-25 09:48:28 -04:00
|
|
|
{
|
2019-11-06 15:38:43 -08:00
|
|
|
return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
|
2010-05-25 09:48:28 -04:00
|
|
|
}
|
|
|
|
|
|
2022-08-06 10:03:26 +02:00
|
|
|
static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
|
2010-11-22 03:04:43 +00:00
|
|
|
{
|
2022-08-06 10:03:26 +02:00
|
|
|
struct btrfs_dio_private *dip = bbio->private;
|
|
|
|
|
struct bio *bio = &bbio->bio;
|
2017-06-03 09:38:06 +02:00
|
|
|
blk_status_t err = bio->bi_status;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
if (err)
|
|
|
|
|
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
|
2016-06-05 14:32:21 -05:00
|
|
|
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
|
2017-01-20 14:54:07 +01:00
|
|
|
btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
|
2020-11-26 15:41:27 +01:00
|
|
|
bio->bi_opf, bio->bi_iter.bi_sector,
|
2014-09-12 18:44:03 +08:00
|
|
|
bio->bi_iter.bi_size, err);
|
|
|
|
|
|
2021-10-08 15:30:00 +08:00
|
|
|
if (bio_op(bio) == REQ_OP_READ)
|
2022-03-24 17:06:28 +01:00
|
|
|
err = btrfs_check_read_dio_bio(dip, bbio, !err);
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2020-04-16 14:46:22 -07:00
|
|
|
if (err)
|
2022-05-05 15:11:15 -05:00
|
|
|
dip->bio.bi_status = err;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2022-03-24 17:06:28 +01:00
|
|
|
btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
|
2021-02-04 19:22:06 +09:00
|
|
|
|
2010-11-22 03:04:43 +00:00
|
|
|
bio_put(bio);
|
2020-04-16 14:46:22 -07:00
|
|
|
btrfs_dio_private_put(dip);
|
2014-09-12 18:43:56 +08:00
|
|
|
}
|
|
|
|
|
|
2022-06-17 12:04:13 +02:00
|
|
|
static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
|
|
|
|
|
u64 file_offset, int async_submit)
|
2010-11-22 03:04:43 +00:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2022-08-06 10:03:26 +02:00
|
|
|
struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
|
2017-06-03 09:38:06 +02:00
|
|
|
blk_status_t ret;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2022-07-07 07:33:30 +02:00
|
|
|
/* Save the original iter for read repair */
|
|
|
|
|
if (btrfs_op(bio) == BTRFS_MAP_READ)
|
|
|
|
|
btrfs_bio(bio)->iter = bio->bi_iter;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2017-08-03 15:44:58 +03:00
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
|
2011-04-06 14:41:34 -04:00
|
|
|
goto map;
|
|
|
|
|
|
2022-05-26 09:36:40 +02:00
|
|
|
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
|
2022-05-26 09:36:35 +02:00
|
|
|
/* Check btrfs_submit_data_write_bio() for async submit rules */
|
2022-06-17 12:04:12 +02:00
|
|
|
if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
|
|
|
|
|
btrfs_wq_submit_bio(inode, bio, 0, file_offset,
|
|
|
|
|
btrfs_submit_bio_start_direct_io))
|
2022-06-17 12:04:13 +02:00
|
|
|
return;
|
2022-06-17 12:04:12 +02:00
|
|
|
|
2011-04-06 14:41:34 -04:00
|
|
|
/*
|
|
|
|
|
* If we aren't doing async submit, calculate the csum of the
|
|
|
|
|
* bio now.
|
|
|
|
|
*/
|
2019-11-06 15:38:43 -08:00
|
|
|
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
|
2022-06-17 12:04:13 +02:00
|
|
|
if (ret) {
|
2022-08-06 10:03:26 +02:00
|
|
|
btrfs_bio_end_io(btrfs_bio(bio), ret);
|
2022-06-17 12:04:13 +02:00
|
|
|
return;
|
|
|
|
|
}
|
2014-09-12 18:43:54 +08:00
|
|
|
} else {
|
2022-05-22 13:47:52 +02:00
|
|
|
btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
|
|
|
|
|
file_offset - dip->file_offset);
|
2011-03-01 06:48:31 +00:00
|
|
|
}
|
2011-04-06 14:41:34 -04:00
|
|
|
map:
|
2022-06-17 12:04:07 +02:00
|
|
|
btrfs_submit_bio(fs_info, bio, 0);
|
2010-11-22 03:04:43 +00:00
|
|
|
}
|
|
|
|
|
|
2021-10-12 13:12:24 +02:00
|
|
|
/*
 * ->submit_io callback for iomap direct IO (see btrfs_dio_ops).
 *
 * @dio_bio is embedded in a struct btrfs_dio_private, which is initialized
 * here.  For reads on inodes with data checksums, all csums for the bio are
 * loaded up front.  The bio is then split along the boundaries reported by
 * btrfs_get_io_geometry() and each piece is cloned and submitted through
 * btrfs_submit_dio_bio().  The number of bytes submitted is accumulated in
 * the btrfs_dio_data attached to @iter.
 *
 * On failure the error is stored in dio_bio->bi_status and the initial
 * reference on the dio private is dropped.
 */
static void btrfs_submit_direct(const struct iomap_iter *iter,
		struct bio *dio_bio, loff_t file_offset)
{
	struct btrfs_dio_private *dip =
		container_of(dio_bio, struct btrfs_dio_private, bio);
	struct inode *inode = iter->inode;
	const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
			     BTRFS_BLOCK_GROUP_RAID56_MASK);
	struct bio *bio;
	u64 start_sector;
	int async_submit = 0;
	u64 submit_len;
	u64 clone_offset = 0;
	u64 clone_len;
	u64 logical;
	int ret;
	blk_status_t status;
	struct btrfs_io_geometry geom;
	struct btrfs_dio_data *dio_data = iter->private;
	struct extent_map *em = NULL;

	dip->inode = inode;
	dip->file_offset = file_offset;
	dip->bytes = dio_bio->bi_iter.bi_size;
	refcount_set(&dip->refs, 1);
	dip->csums = NULL;

	if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		unsigned int nr_sectors =
			(dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);

		/*
		 * Load the csums up front to reduce csum tree searches and
		 * contention when submitting bios.
		 */
		status = BLK_STS_RESOURCE;
		dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
		/*
		 * Check the allocation result, not dip: dip comes from
		 * container_of() and is never NULL.
		 */
		if (!dip->csums)
			goto out_err;

		status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
		if (status != BLK_STS_OK)
			goto out_err;
	}

	start_sector = dio_bio->bi_iter.bi_sector;
	submit_len = dio_bio->bi_iter.bi_size;

	do {
		logical = start_sector << 9;
		em = btrfs_get_chunk_map(fs_info, logical, submit_len);
		if (IS_ERR(em)) {
			status = errno_to_blk_status(PTR_ERR(em));
			em = NULL;
			goto out_err_em;
		}
		ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
					    logical, &geom);
		if (ret) {
			status = errno_to_blk_status(ret);
			goto out_err_em;
		}

		clone_len = min(submit_len, geom.len);
		ASSERT(clone_len <= UINT_MAX);

		/*
		 * This will never fail as it's passing GFP_NOFS and
		 * the allocation is backed by btrfs_bioset.
		 */
		bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
					      btrfs_end_dio_bio, dip);
		btrfs_bio(bio)->file_offset = file_offset;

		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			status = extract_ordered_extent(BTRFS_I(inode), bio,
							file_offset);
			if (status) {
				bio_put(bio);
				/* Go through out_err_em so the chunk map is released. */
				goto out_err_em;
			}
		}

		ASSERT(submit_len >= clone_len);
		submit_len -= clone_len;

		/*
		 * Increase the count before we submit the bio so we know
		 * the end IO handler won't happen before we increase the
		 * count. Otherwise, the dip might get freed before we're
		 * done setting it up.
		 *
		 * We transfer the initial reference to the last bio, so we
		 * don't need to increment the reference count for the last one.
		 */
		if (submit_len > 0) {
			refcount_inc(&dip->refs);
			/*
			 * If we are submitting more than one bio, submit them
			 * all asynchronously. The exception is RAID 5 or 6, as
			 * asynchronous checksums make it difficult to collect
			 * full stripe writes.
			 */
			if (!raid56)
				async_submit = 1;
		}

		btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);

		dio_data->submitted += clone_len;
		clone_offset += clone_len;
		start_sector += clone_len >> 9;
		file_offset += clone_len;

		free_extent_map(em);
	} while (submit_len > 0);
	return;

out_err_em:
	free_extent_map(em);
out_err:
	dio_bio->bi_status = status;
	btrfs_dio_private_put(dip);
}
|
|
|
|
|
|
2022-05-05 15:11:09 -05:00
|
|
|
static const struct iomap_ops btrfs_dio_iomap_ops = {
|
2020-08-17 11:18:21 -05:00
|
|
|
.iomap_begin = btrfs_dio_iomap_begin,
|
|
|
|
|
.iomap_end = btrfs_dio_iomap_end,
|
|
|
|
|
};
|
|
|
|
|
|
2022-05-05 15:11:09 -05:00
|
|
|
static const struct iomap_dio_ops btrfs_dio_ops = {
|
2020-08-17 11:18:21 -05:00
|
|
|
.submit_io = btrfs_submit_direct,
|
2022-05-05 15:11:15 -05:00
|
|
|
.bio_set = &btrfs_dio_bioset,
|
2020-08-17 11:18:21 -05:00
|
|
|
};
|
|
|
|
|
|
2022-05-05 15:11:09 -05:00
|
|
|
/*
 * Perform a direct IO read or write through iomap, using the btrfs iomap and
 * dio operation tables.
 *
 * The per-call struct btrfs_dio_data lives on the stack and is handed to the
 * iomap callbacks as private data.  It is zero-initialized so that no
 * callback can observe indeterminate values in it.
 * NOTE(review): btrfs_dio_iomap_begin() may already set every field before
 * use; confirm, the zeroing is harmless either way.
 *
 * Returns whatever iomap_dio_rw() returns.
 */
ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
			    &data, done_before);
}
|
|
|
|
|
|
2009-01-21 14:39:14 -05:00
|
|
|
/*
 * fiemap entry point.
 *
 * @inode:   inode whose extents are being mapped
 * @fieinfo: fiemap request/response state
 * @start:   start offset of the requested range
 * @len:     length of the requested range (may be adjusted by fiemap_prep())
 *
 * Returns the error from fiemap_prep() or btrfs_wait_ordered_range() if
 * either fails, otherwise the result of extent_fiemap().
 */
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			u64 start, u64 len)
{
	int err = fiemap_prep(inode, fieinfo, start, &len, 0);

	if (err)
		return err;

	/*
	 * fiemap_prep() already ran filemap_write_and_wait() over the whole
	 * possible file range (0 to LLONG_MAX), but with compression enabled
	 * that is not sufficient: the first filemap_fdatawrite_range() only
	 * kicks off compression in an async thread and returns before the
	 * compression has finished and writeback has started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and for writeback to start.
	 *
	 * Ordered extents must be waited for as well, because fiemap lists
	 * extents mainly from file extent items and only consults extent maps
	 * for ranges with holes or prealloc extents, to find out whether
	 * there is delalloc in those ranges.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
		err = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);

	return err ? err : extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
|
|
|
|
|
|
2013-04-25 20:41:01 +00:00
|
|
|
/*
 * Thin wrapper: forwards @mapping and @wbc to extent_writepages() and
 * returns its result unchanged.
 */
static int btrfs_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	int ret = extent_writepages(mapping, wbc);

	return ret;
}
|
|
|
|
|
|
2020-06-01 21:47:05 -07:00
|
|
|
/*
 * Thin wrapper: hands the readahead request @rac straight to
 * extent_readahead(). Nothing is returned.
 */
static void btrfs_readahead(struct readahead_control *rac)
{
	extent_readahead(rac);
}
|
2018-04-19 10:46:36 +03:00
|
|
|
|
2021-07-26 14:35:03 +08:00
|
|
|
/*
|
2022-04-30 23:15:16 -04:00
|
|
|
* For release_folio() and invalidate_folio() we have a race window where
|
2022-02-09 20:21:39 +00:00
|
|
|
* folio_end_writeback() is called but the subpage spinlock is not yet released.
|
2021-07-26 14:35:03 +08:00
|
|
|
* If we continue to release/invalidate the page, we could cause use-after-free
|
|
|
|
|
* for subpage spinlock. So this function is to spin and wait for subpage
|
|
|
|
|
* spinlock.
|
|
|
|
|
 *
 * @page: page whose subpage spinlock is waited on. The function returns
 *        immediately when btrfs_is_subpage() is false for this page;
 *        otherwise page->private must be set (asserted) and is used as the
 *        struct btrfs_subpage holding the lock.
*/
|
|
|
|
|
static void wait_subpage_spinlock(struct page *page)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
|
|
|
|
|
struct btrfs_subpage *subpage;
|
|
|
|
|
|
btrfs: make nodesize >= PAGE_SIZE case to reuse the non-subpage routine
The reason why we only support 64K page size for subpage is, for 64K
page size we can ensure no matter what the nodesize is, we can fit it
into one page.
When other page size come, especially like 16K, the limitation is a bit
limiting.
To remove such limitation, we allow nodesize >= PAGE_SIZE case to go the
non-subpage routine. By this, we can allow 4K sectorsize on 16K page
size.
Although this introduces another smaller limitation, the metadata can
not cross page boundary, which is already met by most recent mkfs.
Another small improvement is, we can avoid the overhead for metadata if
nodesize >= PAGE_SIZE.
For 4K sector size and 64K page size/node size, or 4K sector size and
16K page size/node size, we don't need to allocate extra memory for the
metadata pages.
Please note that, this patch will not yet enable other page size support
yet.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-13 13:22:09 +08:00
|
|
|
/* Not handled by the subpage code: return without touching page->private. */
if (!btrfs_is_subpage(fs_info, page))
|
2021-07-26 14:35:03 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
ASSERT(PagePrivate(page) && page->private);
|
|
|
|
|
subpage = (struct btrfs_subpage *)page->private;
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* This may look insane as we just acquire the spinlock and release it,
|
|
|
|
|
* without doing anything. But we just want to make sure no one is
|
|
|
|
|
* still holding the subpage spinlock.
|
|
|
|
|
* And since the page is not dirty nor writeback, and we have page
|
|
|
|
|
* locked, the only possible way to hold a spinlock is from the endio
|
|
|
|
|
* function to clear page writeback.
|
|
|
|
|
*
|
|
|
|
|
* Here we just acquire the spinlock so that all existing callers
|
|
|
|
|
* should exit and we're safe to release/invalidate the page.
|
|
|
|
|
*/
|
|
|
|
|
spin_lock_irq(&subpage->lock);
|
|
|
|
|
spin_unlock_irq(&subpage->lock);
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-30 23:15:16 -04:00
|
|
|
/*
 * Try to release the extent mapping of @folio's page.
 *
 * Calls try_release_extent_mapping() on &folio->page with @gfp_flags. Only
 * when that returns exactly 1 does it wait for the subpage spinlock and then
 * call clear_page_extent_mapped() on the page.
 *
 * Returns the try_release_extent_mapping() result, converted from int to
 * bool by the return type.
 */
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
|
2007-06-15 13:50:00 -04:00
|
|
|
{
|
2022-04-30 23:15:16 -04:00
|
|
|
int ret = try_release_extent_mapping(&folio->page, gfp_flags);
|
2021-07-26 14:35:03 +08:00
|
|
|
|
|
|
|
|
|
if (ret == 1) {
|
2022-04-30 23:15:16 -04:00
|
|
|
/* Wait for the subpage spinlock before clearing the extent-mapped state. */
wait_subpage_spinlock(&folio->page);
|
|
|
|
|
clear_page_extent_mapped(&folio->page);
|
2021-07-26 14:35:03 +08:00
|
|
|
}
|
2007-08-27 16:49:44 -04:00
|
|
|
return ret;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2022-04-30 23:15:16 -04:00
|
|
|
/*
 * Release @folio unless it is still under writeback or dirty.
 *
 * Returns false without doing anything when folio_test_writeback() or
 * folio_test_dirty() is true; otherwise returns the result of
 * __btrfs_release_folio(folio, gfp_flags).
 */
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
|
2008-07-17 12:53:50 -04:00
|
|
|
{
|
2022-04-30 23:15:16 -04:00
|
|
|
if (folio_test_writeback(folio) || folio_test_dirty(folio))
|
|
|
|
|
return false;
|
|
|
|
|
return __btrfs_release_folio(folio, gfp_flags);
|
2008-07-17 12:53:50 -04:00
|
|
|
}
|
|
|
|
|
|
2020-03-04 16:57:35 -08:00
|
|
|
#ifdef CONFIG_MIGRATION
|
2022-06-06 10:47:21 -04:00
|
|
|
/*
 * Migrate @src to @dst and carry the Ordered flag along.
 *
 * The actual migration is done by filemap_migrate_folio(); any result other
 * than MIGRATEPAGE_SUCCESS is returned to the caller unchanged. On success,
 * if @src has the Ordered flag set it is cleared on @src and set on @dst,
 * and MIGRATEPAGE_SUCCESS is returned.
 */
static int btrfs_migrate_folio(struct address_space *mapping,
|
|
|
|
|
struct folio *dst, struct folio *src,
|
2020-03-04 16:57:35 -08:00
|
|
|
enum migrate_mode mode)
|
|
|
|
|
{
|
2022-06-06 10:47:21 -04:00
|
|
|
int ret = filemap_migrate_folio(mapping, dst, src, mode);
|
2020-03-04 16:57:35 -08:00
|
|
|
|
|
|
|
|
|
if (ret != MIGRATEPAGE_SUCCESS)
|
|
|
|
|
return ret;
|
|
|
|
|
|
2022-06-06 10:47:21 -04:00
|
|
|
/* Move the Ordered flag from the old folio to the new one. */
if (folio_test_ordered(src)) {
|
|
|
|
|
folio_clear_ordered(src);
|
|
|
|
|
folio_set_ordered(dst);
|
2020-03-04 16:57:35 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
return MIGRATEPAGE_SUCCESS;
|
|
|
|
|
}
|
2022-06-06 10:47:21 -04:00
|
|
|
#else
|
|
|
|
|
/* Without CONFIG_MIGRATION the name expands to NULL. */
#define btrfs_migrate_folio NULL
|
2020-03-04 16:57:35 -08:00
|
|
|
#endif
|
|
|
|
|
|
2022-02-09 20:21:39 +00:00
|
|
|
/*
 * Invalidate a folio, or try to release it when only part of it is targeted.
 *
 * @folio:  folio to invalidate
 * @offset: start of the range inside the folio (only compared against 0 here)
 * @length: length of the range (only compared against folio_size() here)
 *
 * Order of operations:
 * - Wait for writeback on the folio and for the subpage spinlock.
 * - If the range is not the whole folio (offset != 0 or
 *   length != folio_size()), call btrfs_release_folio() and return.
 * - Unless the inode has I_FREEING set, lock [page_start, page_end] in the
 *   inode's io_tree.
 * - Walk the folio range one piece at a time, using
 *   btrfs_lookup_first_ordered_range() to find the next ordered extent:
 *     - a piece with no ordered extent is only cleaned up, with its extent
 *       states deleted;
 *     - a piece whose Ordered bit is already clear is cleaned up without
 *       deleting extent states;
 *     - otherwise the Ordered bit is cleared, the ordered extent is flagged
 *       BTRFS_ORDERED_TRUNCATED with truncated_len lowered to the start of
 *       the piece, and btrfs_dec_test_ordered_pending() decides whether
 *       btrfs_finish_ordered_io() is run here.
 *   Every piece then has its qgroup reserved data freed and, unless
 *   evicting, its io_tree bits cleared.
 * - Clear the Checked state for the whole folio, call
 *   __btrfs_release_folio() unless evicting, and finally
 *   clear_page_extent_mapped().
 */
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
|
|
|
|
|
size_t length)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2022-02-09 20:21:39 +00:00
|
|
|
struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
|
2021-05-31 16:50:46 +08:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
2020-08-31 14:42:43 +03:00
|
|
|
struct extent_io_tree *tree = &inode->io_tree;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2022-02-09 20:21:39 +00:00
|
|
|
u64 page_start = folio_pos(folio);
|
|
|
|
|
u64 page_end = page_start + folio_size(folio) - 1;
|
2021-04-06 19:54:53 +08:00
|
|
|
u64 cur;
|
2020-08-31 14:42:43 +03:00
|
|
|
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2009-09-02 16:53:46 -04:00
|
|
|
/*
|
2022-02-09 20:21:39 +00:00
|
|
|
* We have folio locked so no new ordered extent can be created on this
|
|
|
|
|
* page, nor bio can be submitted for this folio.
|
2009-09-02 16:53:46 -04:00
|
|
|
*
|
2022-02-09 20:21:39 +00:00
|
|
|
* But already submitted bio can still be finished on this folio.
|
|
|
|
|
* Furthermore, endio function won't skip folio which has Ordered
|
2021-04-07 19:22:13 +08:00
|
|
|
* (Private2) already cleared, so it's possible for endio and
|
2022-02-09 20:21:39 +00:00
|
|
|
* invalidate_folio to do the same ordered extent accounting twice
|
|
|
|
|
* on one folio.
|
2021-04-06 08:27:18 +08:00
|
|
|
*
|
|
|
|
|
* So here we wait for any submitted bios to finish, so that we won't
|
2022-02-09 20:21:39 +00:00
|
|
|
* do double ordered extent accounting on the same folio.
|
2009-09-02 16:53:46 -04:00
|
|
|
*/
|
2022-02-09 20:21:39 +00:00
|
|
|
folio_wait_writeback(folio);
|
|
|
|
|
wait_subpage_spinlock(&folio->page);
|
2009-09-02 16:53:46 -04:00
|
|
|
|
2021-05-31 16:50:55 +08:00
|
|
|
/*
|
|
|
|
|
* For subpage case, we have call sites like
|
|
|
|
|
* btrfs_punch_hole_lock_range() which passes range not aligned to
|
|
|
|
|
* sectorsize.
|
2022-02-09 20:21:39 +00:00
|
|
|
* If the range doesn't cover the full folio, we don't need to and
|
|
|
|
|
* shouldn't clear page extent mapped, as folio->private can still
|
2021-05-31 16:50:55 +08:00
|
|
|
* record subpage dirty bits for other part of the range.
|
|
|
|
|
*
|
2022-02-09 20:21:39 +00:00
|
|
|
* For cases that invalidate the full folio even the range doesn't
|
|
|
|
|
* cover the full folio, like invalidating the last folio, we're
|
2021-05-31 16:50:55 +08:00
|
|
|
* still safe to wait for ordered extent to finish.
|
|
|
|
|
*/
|
2022-03-29 23:25:06 -04:00
|
|
|
if (!(offset == 0 && length == folio_size(folio))) {
|
2022-04-30 23:15:16 -04:00
|
|
|
btrfs_release_folio(folio, GFP_NOFS);
|
2008-07-17 12:53:50 -04:00
|
|
|
return;
|
|
|
|
|
}
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
|
|
if (!inode_evicting)
|
2015-12-03 14:30:40 +01:00
|
|
|
lock_extent_bits(tree, page_start, page_end, &cached_state);
|
btrfs: fix double accounting of ordered extent for subpage case in btrfs_invalidapge
Commit dbfdb6d1b369 ("Btrfs: Search for all ordered extents that could
span across a page") make btrfs_invalidapage() to search all ordered
extents.
The offending code looks like this:
again:
start = page_start;
ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordred) {
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
/* Do the cleanup */
start = end + 1;
if (start < page_end)
goto again;
}
The behavior is indeed necessary for the incoming subpage support, but
when it iterates through all the ordered extents, it also resets the
search range @start.
This means, for the following cases, we can double account the ordered
extents, causing its bytes_left underflow:
Page offset
0 16K 32K
|<--- OE 1 --->|<--- OE 2 ---->|
As the first iteration will find ordered extent (OE) 1, which doesn't
cover the full page, thus after cleanup code, we need to retry again.
But again label will reset start to page_start, and we got OE 1 again,
which causes double accounting on OE 1, and cause OE 1's byte_left to
underflow.
This problem can only happen for subpage case, as for regular sectorsize
== PAGE_SIZE case, we will always find a OE ends at or after page end,
thus no way to trigger the problem.
Move the again label after start = page_start. There will be more
comprehensive rework to convert the open coded loop to a proper while
loop for subpage support.
Fixes: dbfdb6d1b369 ("Btrfs: Search for all ordered extents that could span across a page")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-27 14:38:48 +08:00
|
|
|
|
2021-04-06 19:54:53 +08:00
|
|
|
cur = page_start;
|
|
|
|
|
while (cur < page_end) {
|
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
|
bool delete_states;
|
|
|
|
|
u64 range_end;
|
2021-05-31 16:50:46 +08:00
|
|
|
u32 range_len;
|
2021-04-06 19:54:53 +08:00
|
|
|
|
|
|
|
|
|
ordered = btrfs_lookup_first_ordered_range(inode, cur,
|
|
|
|
|
page_end + 1 - cur);
|
|
|
|
|
if (!ordered) {
|
|
|
|
|
range_end = page_end;
|
|
|
|
|
/*
|
|
|
|
|
* No ordered extent covering this range, we are safe
|
|
|
|
|
* to delete all extent states in the range.
|
|
|
|
|
*/
|
|
|
|
|
delete_states = true;
|
|
|
|
|
goto next;
|
|
|
|
|
}
|
|
|
|
|
if (ordered->file_offset > cur) {
|
|
|
|
|
/*
|
|
|
|
|
* There is a range between [cur, oe->file_offset) not
|
|
|
|
|
* covered by any ordered extent.
|
|
|
|
|
* We are safe to delete all extent states, and handle
|
|
|
|
|
* the ordered extent in the next iteration.
|
|
|
|
|
*/
|
|
|
|
|
range_end = ordered->file_offset - 1;
|
|
|
|
|
delete_states = true;
|
|
|
|
|
goto next;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
range_end = min(ordered->file_offset + ordered->num_bytes - 1,
|
|
|
|
|
page_end);
|
2021-05-31 16:50:46 +08:00
|
|
|
ASSERT(range_end + 1 - cur < U32_MAX);
|
|
|
|
|
range_len = range_end + 1 - cur;
|
2022-02-09 20:21:39 +00:00
|
|
|
if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
|
2021-04-06 19:54:53 +08:00
|
|
|
/*
|
2021-04-07 19:22:13 +08:00
|
|
|
* If Ordered (Private2) is cleared, it means endio has
|
|
|
|
|
* already been executed for the range.
|
2021-04-06 19:54:53 +08:00
|
|
|
* We can't delete the extent states as
|
|
|
|
|
* btrfs_finish_ordered_io() may still use some of them.
|
|
|
|
|
*/
|
|
|
|
|
delete_states = false;
|
|
|
|
|
goto next;
|
|
|
|
|
}
|
2022-02-09 20:21:39 +00:00
|
|
|
btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
|
2021-04-06 19:54:53 +08:00
|
|
|
|
2008-07-17 13:53:27 -04:00
|
|
|
/*
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
* IO on this page will never be started, so we need to account
|
|
|
|
|
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
|
|
|
|
|
* here, must leave that up for the ordered extent completion.
|
2021-04-06 19:54:53 +08:00
|
|
|
*
|
|
|
|
|
* This will also unlock the range for incoming
|
|
|
|
|
* btrfs_finish_ordered_io().
|
2008-07-17 13:53:27 -04:00
|
|
|
*/
|
2013-11-19 22:29:35 +00:00
|
|
|
if (!inode_evicting)
|
2021-04-06 19:54:53 +08:00
|
|
|
clear_extent_bit(tree, cur, range_end,
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
EXTENT_DELALLOC |
|
2013-11-19 22:29:35 +00:00
|
|
|
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
|
2017-10-31 16:37:52 +01:00
|
|
|
EXTENT_DEFRAG, 1, 0, &cached_state);
|
2021-04-06 19:54:53 +08:00
|
|
|
|
|
|
|
|
|
spin_lock_irq(&inode->ordered_tree.lock);
|
|
|
|
|
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
|
|
|
|
|
ordered->truncated_len = min(ordered->truncated_len,
|
|
|
|
|
cur - ordered->file_offset);
|
|
|
|
|
spin_unlock_irq(&inode->ordered_tree.lock);
|
|
|
|
|
|
|
|
|
|
|
if (btrfs_dec_test_ordered_pending(inode, &ordered,
|
2021-07-26 14:15:10 +02:00
|
|
|
cur, range_end + 1 - cur)) {
|
2021-04-06 19:54:53 +08:00
|
|
|
btrfs_finish_ordered_io(ordered);
|
|
|
|
|
/*
|
|
|
|
|
* The ordered extent has finished, now we're again
|
|
|
|
|
* safe to delete all extent states of the range.
|
|
|
|
|
*/
|
|
|
|
|
delete_states = true;
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* btrfs_finish_ordered_io() will get executed by endio
|
|
|
|
|
* of other pages, thus we can't delete extent states
|
|
|
|
|
* anymore
|
|
|
|
|
*/
|
|
|
|
|
delete_states = false;
|
|
|
|
|
}
|
|
|
|
|
next:
|
|
|
|
|
if (ordered)
|
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
2009-09-02 16:53:46 -04:00
|
|
|
/*
|
2021-04-06 19:54:53 +08:00
|
|
|
* Qgroup reserved space handler
|
|
|
|
|
* Sector(s) here will be either:
|
2021-04-06 08:27:18 +08:00
|
|
|
*
|
2021-04-06 19:54:53 +08:00
|
|
|
* 1) Already written to disk or bio already finished
|
|
|
|
|
* Then its QGROUP_RESERVED bit in io_tree is already cleared.
|
|
|
|
|
* Qgroup will be handled by its qgroup_record then.
|
|
|
|
|
* btrfs_qgroup_free_data() call will do nothing here.
|
|
|
|
|
*
|
|
|
|
|
* 2) Not written to disk yet
|
|
|
|
|
* Then btrfs_qgroup_free_data() call will clear the
|
|
|
|
|
* QGROUP_RESERVED bit of its io_tree, and free the qgroup
|
|
|
|
|
* reserved data space.
|
|
|
|
|
* Since the IO will never happen for this page.
|
2009-09-02 16:53:46 -04:00
|
|
|
*/
|
2021-04-06 19:54:53 +08:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
|
2013-11-19 22:29:35 +00:00
|
|
|
if (!inode_evicting) {
|
2021-04-06 19:54:53 +08:00
|
|
|
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
|
|
|
|
|
EXTENT_DELALLOC | EXTENT_UPTODATE |
|
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
|
|
|
|
|
delete_states, &cached_state);
|
2013-11-19 22:29:35 +00:00
|
|
|
}
|
2021-04-06 19:54:53 +08:00
|
|
|
cur = range_end + 1;
|
2013-11-19 22:29:35 +00:00
|
|
|
}
|
2015-09-29 10:35:16 +08:00
|
|
|
/*
|
2021-04-06 19:54:53 +08:00
|
|
|
* We have iterated through all ordered extents of the page, the page
|
2021-04-07 19:22:13 +08:00
|
|
|
* should not have Ordered (Private2) anymore, or the above iteration
|
|
|
|
|
* did something wrong.
|
2015-09-29 10:35:16 +08:00
|
|
|
*/
|
2022-02-09 20:21:39 +00:00
|
|
|
ASSERT(!folio_test_ordered(folio));
|
|
|
|
|
btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
|
2021-04-06 19:54:53 +08:00
|
|
|
if (!inode_evicting)
|
2022-04-30 23:15:16 -04:00
|
|
|
__btrfs_release_folio(folio, GFP_NOFS);
|
2022-02-09 20:21:39 +00:00
|
|
|
clear_page_extent_mapped(&folio->page);
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2007-06-15 13:50:00 -04:00
|
|
|
/*
|
|
|
|
|
* btrfs_page_mkwrite() is not allowed to change the file size as it gets
|
|
|
|
|
* called from a page fault handler when a page is first dirtied. Hence we must
|
|
|
|
|
* be careful to check for EOF conditions here. We set the page up correctly
|
|
|
|
|
* for a written page which means we get ENOSPC checking when writing into
|
|
|
|
|
* holes and correct delalloc and unwritten extent mapping on filesystems that
|
|
|
|
|
* support these features.
|
|
|
|
|
*
|
|
|
|
|
* We are not allowed to take the i_mutex here so we have to play games to
|
|
|
|
|
* protect against truncate races as the page could now be beyond EOF. Because
|
2018-05-11 13:13:29 -07:00
|
|
|
* truncate_setsize() writes the inode size before removing pages, once we have
|
|
|
|
|
* the page lock we can determine safely if the page is beyond EOF. If it is not
|
2007-06-15 13:50:00 -04:00
|
|
|
* beyond EOF, then the page is guaranteed safe against truncation until we
|
|
|
|
|
* unlock the page.
|
|
|
|
|
*/
|
2018-06-06 19:54:44 +05:30
|
|
|
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
|
2007-06-15 13:50:00 -04:00
|
|
|
{
|
2009-03-31 15:23:21 -07:00
|
|
|
struct page *page = vmf->page;
|
2017-02-24 14:56:41 -08:00
|
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-07-17 12:53:50 -04:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2017-02-27 15:10:38 +08:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2008-07-17 12:53:50 -04:00
|
|
|
unsigned long zero_start;
|
2007-06-15 13:50:00 -04:00
|
|
|
loff_t size;
|
2018-06-06 19:54:44 +05:30
|
|
|
vm_fault_t ret;
|
|
|
|
|
int ret2;
|
2012-01-25 13:47:40 -05:00
|
|
|
int reserved = 0;
|
2016-01-21 15:55:57 +05:30
|
|
|
u64 reserved_space;
|
2007-08-27 16:49:44 -04:00
|
|
|
u64 page_start;
|
2008-07-17 12:53:50 -04:00
|
|
|
u64 page_end;
|
2016-01-21 15:55:57 +05:30
|
|
|
u64 end;
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
reserved_space = PAGE_SIZE;
|
2007-06-15 13:50:00 -04:00
|
|
|
|
2012-06-12 16:20:45 +02:00
|
|
|
sb_start_pagefault(inode->i_sb);
|
2015-09-08 17:25:54 +08:00
|
|
|
page_start = page_offset(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
page_end = page_start + PAGE_SIZE - 1;
|
2016-01-21 15:55:57 +05:30
|
|
|
end = page_end;
|
2015-09-08 17:25:54 +08:00
|
|
|
|
2016-01-21 15:55:57 +05:30
|
|
|
/*
|
|
|
|
|
* Reserving delalloc space after obtaining the page lock can lead to
|
|
|
|
|
* deadlock. For example, if a dirty page is locked by this function
|
|
|
|
|
* and the call to btrfs_delalloc_reserve_space() ends up triggering
|
2022-06-21 09:49:44 +02:00
|
|
|
* dirty page write out, then the btrfs_writepages() function could
|
2016-01-21 15:55:57 +05:30
|
|
|
* end up waiting indefinitely to get a lock on the page currently
|
|
|
|
|
* being processed by btrfs_page_mkwrite() function.
|
|
|
|
|
*/
|
2020-06-03 08:55:42 +03:00
|
|
|
ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
|
|
|
|
|
page_start, reserved_space);
|
2018-06-06 19:54:44 +05:30
|
|
|
if (!ret2) {
|
|
|
|
|
ret2 = file_update_time(vmf->vma->vm_file);
|
2012-01-25 13:47:40 -05:00
|
|
|
reserved = 1;
|
|
|
|
|
}
|
2018-06-06 19:54:44 +05:30
|
|
|
if (ret2) {
|
|
|
|
|
ret = vmf_error(ret2);
|
2012-01-25 13:47:40 -05:00
|
|
|
if (reserved)
|
|
|
|
|
goto out;
|
|
|
|
|
goto out_noreserve;
|
2009-03-31 15:23:23 -07:00
|
|
|
}
|
2007-12-21 16:27:21 -05:00
|
|
|
|
2009-03-31 15:23:23 -07:00
|
|
|
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
|
2008-07-17 12:53:50 -04:00
|
|
|
again:
|
2021-02-10 17:14:33 -05:00
|
|
|
down_read(&BTRFS_I(inode)->i_mmap_lock);
|
2007-06-15 13:50:00 -04:00
|
|
|
lock_page(page);
|
|
|
|
|
size = i_size_read(inode);
|
2007-08-27 16:49:44 -04:00
|
|
|
|
2007-06-15 13:50:00 -04:00
|
|
|
if ((page->mapping != inode->i_mapping) ||
|
2008-07-17 12:53:50 -04:00
|
|
|
(page_start >= size)) {
|
2007-06-15 13:50:00 -04:00
|
|
|
/* page got truncated out from underneath us */
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
2008-07-17 12:53:50 -04:00
|
|
|
wait_on_page_writeback(page);
|
|
|
|
|
|
2015-12-03 14:30:40 +01:00
|
|
|
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
|
2021-01-26 16:34:00 +08:00
|
|
|
ret2 = set_page_extent_mapped(page);
|
|
|
|
|
if (ret2 < 0) {
|
|
|
|
|
ret = vmf_error(ret2);
|
|
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
2008-07-17 12:53:50 -04:00
|
|
|
|
2008-07-17 13:53:27 -04:00
|
|
|
/*
|
|
|
|
|
* we can't set the delalloc bits if there are pending ordered
|
|
|
|
|
* extents. Drop our locks and wait for them to finish
|
|
|
|
|
*/
|
2017-02-20 13:50:49 +02:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
|
|
|
|
|
PAGE_SIZE);
|
2008-07-17 12:53:50 -04:00
|
|
|
if (ordered) {
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
2017-12-12 21:43:52 +01:00
|
|
|
&cached_state);
|
2008-07-17 12:53:50 -04:00
|
|
|
unlock_page(page);
|
2021-02-10 17:14:33 -05:00
|
|
|
up_read(&BTRFS_I(inode)->i_mmap_lock);
|
2020-09-18 12:15:53 +03:00
|
|
|
btrfs_start_ordered_extent(ordered, 1);
|
2008-07-17 12:53:50 -04:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
goto again;
|
|
|
|
|
}
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
|
2016-06-15 09:22:56 -04:00
|
|
|
reserved_space = round_up(size - page_start,
|
2016-06-22 18:54:23 -04:00
|
|
|
fs_info->sectorsize);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
if (reserved_space < PAGE_SIZE) {
|
2016-01-21 15:55:57 +05:30
|
|
|
end = page_start + reserved_space - 1;
|
2020-06-03 08:55:40 +03:00
|
|
|
btrfs_delalloc_release_space(BTRFS_I(inode),
|
|
|
|
|
data_reserved, page_start,
|
|
|
|
|
PAGE_SIZE - reserved_space, true);
|
2016-01-21 15:55:57 +05:30
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-10-01 17:10:23 -04:00
|
|
|
/*
|
2016-12-13 12:15:19 -08:00
|
|
|
* page_mkwrite gets called when the page is firstly dirtied after it's
|
|
|
|
|
* faulted in, but write(2) could also dirty a page and set delalloc
|
|
|
|
|
* bits, thus in this case for space account reason, we still need to
|
|
|
|
|
* clear any delalloc bits within this page range since we have to
|
|
|
|
|
* reserve data&meta space before lock_page() (see above comments).
|
2009-10-01 17:10:23 -04:00
|
|
|
*/
|
2016-01-21 15:55:57 +05:30
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
|
2019-08-15 14:04:04 -07:00
|
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
|
|
|
|
|
EXTENT_DEFRAG, 0, 0, &cached_state);
|
2009-10-01 17:10:23 -04:00
|
|
|
|
2020-06-03 08:55:35 +03:00
|
|
|
ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
|
2019-07-17 16:18:17 +03:00
|
|
|
&cached_state);
|
2018-06-06 19:54:44 +05:30
|
|
|
if (ret2) {
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
2017-12-12 21:43:52 +01:00
|
|
|
&cached_state);
|
2009-09-11 16:12:44 -04:00
|
|
|
ret = VM_FAULT_SIGBUS;
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
}
|
2007-06-15 13:50:00 -04:00
|
|
|
|
|
|
|
|
/* page is wholly or partially inside EOF */
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
if (page_start + PAGE_SIZE > size)
|
2018-12-05 15:23:03 +01:00
|
|
|
zero_start = offset_in_page(size);
|
2007-06-15 13:50:00 -04:00
|
|
|
else
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
zero_start = PAGE_SIZE;
|
2007-06-15 13:50:00 -04:00
|
|
|
|
2022-06-01 13:47:54 +02:00
|
|
|
if (zero_start != PAGE_SIZE)
|
btrfs: use memzero_page() instead of open coded kmap pattern
There are many places where kmap/memset/kunmap patterns occur.
Use the newly lifted memzero_page() to eliminate direct uses of kmap and
leverage the new core functions use of kmap_local_page().
The development of this patch was aided by the following coccinelle
script:
// <smpl>
// SPDX-License-Identifier: GPL-2.0-only
// Find kmap/memset/kunmap pattern and replace with memset*page calls
//
// NOTE: Offsets and other expressions may be more complex than what the script
// will automatically generate. Therefore a catchall rule is provided to find
// the pattern which then must be evaluated by hand.
//
// Confidence: Low
// Copyright: (C) 2021 Intel Corporation
// URL: http://coccinelle.lip6.fr/
// Comments:
// Options:
//
// Then the memset pattern
//
@ memset_rule1 @
expression page, V, L, Off;
identifier ptr;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
-memset(ptr, 0, L);
+memzero_page(page, 0, L);
|
-memset(ptr + Off, 0, L);
+memzero_page(page, Off, L);
|
-memset(ptr, V, L);
+memset_page(page, V, 0, L);
|
-memset(ptr + Off, V, L);
+memset_page(page, V, Off, L);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule1
@
identifier memset_rule1.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
//
// Catch all
//
@ memset_rule2 @
expression page;
identifier ptr;
expression GenTo, GenSize, GenValue;
type VP;
@@
(
-VP ptr = kmap(page);
|
-ptr = kmap(page);
|
-VP ptr = kmap_atomic(page);
|
-ptr = kmap_atomic(page);
)
<+...
(
//
// Some call sites have complex expressions within the memset/memcpy
// The follow are catch alls which need to be evaluated by hand.
//
-memset(GenTo, 0, GenSize);
+memzero_pageExtra(page, GenTo, GenSize);
|
-memset(GenTo, GenValue, GenSize);
+memset_pageExtra(page, GenValue, GenTo, GenSize);
)
...+>
(
-kunmap(page);
|
-kunmap_atomic(ptr);
)
// Remove any pointers left unused
@
depends on memset_rule2
@
identifier memset_rule2.ptr;
type VP, VP1;
@@
-VP ptr;
... when != ptr;
? VP1 ptr;
// </smpl>
Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-04 18:40:07 -07:00
|
|
|
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
|
2022-06-01 13:47:54 +02:00
|
|
|
|
2021-09-27 15:21:49 +08:00
|
|
|
btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
|
2021-05-31 16:50:52 +08:00
|
|
|
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
|
|
|
|
|
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
|
2009-03-31 13:27:11 -04:00
|
|
|
|
btrfs: fix race between marking inode needs to be logged and log syncing
We have a race between marking that an inode needs to be logged, either
at btrfs_set_inode_last_trans() or at btrfs_page_mkwrite(), and between
btrfs_sync_log(). The following steps describe how the race happens.
1) We are at transaction N;
2) Inode I was previously fsynced in the current transaction so it has:
inode->logged_trans set to N;
3) The inode's root currently has:
root->log_transid set to 1
root->last_log_commit set to 0
Which means only one log transaction was committed to far, log
transaction 0. When a log tree is created we set ->log_transid and
->last_log_commit of its parent root to 0 (at btrfs_add_log_tree());
4) One more range of pages is dirtied in inode I;
5) Some task A starts an fsync against some other inode J (same root), and
so it joins log transaction 1.
Before task A calls btrfs_sync_log()...
6) Task B starts an fsync against inode I, which currently has the full
sync flag set, so it starts delalloc and waits for the ordered extent
to complete before calling btrfs_inode_in_log() at btrfs_sync_file();
7) During ordered extent completion we have btrfs_update_inode() called
against inode I, which in turn calls btrfs_set_inode_last_trans(),
which does the following:
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
inode->last_sub_trans = inode->root->log_transid;
inode->last_log_commit = inode->root->last_log_commit;
spin_unlock(&inode->lock);
So ->last_trans is set to N and ->last_sub_trans set to 1.
But before setting ->last_log_commit...
8) Task A is at btrfs_sync_log():
- it increments root->log_transid to 2
- starts writeback for all log tree extent buffers
- waits for the writeback to complete
- writes the super blocks
- updates root->last_log_commit to 1
It's a lot of slow steps between updating root->log_transid and
root->last_log_commit;
9) The task doing the ordered extent completion, currently at
btrfs_set_inode_last_trans(), then finally runs:
inode->last_log_commit = inode->root->last_log_commit;
spin_unlock(&inode->lock);
Which results in inode->last_log_commit being set to 1.
The ordered extent completes;
10) Task B is resumed, and it calls btrfs_inode_in_log() which returns
true because we have all the following conditions met:
inode->logged_trans == N which matches fs_info->generation &&
inode->last_subtrans (1) <= inode->last_log_commit (1) &&
inode->last_subtrans (1) <= root->last_log_commit (1) &&
list inode->extent_tree.modified_extents is empty
And as a consequence we return without logging the inode, so the
existing logged version of the inode does not point to the extent
that was written after the previous fsync.
It should be impossible in practice for one task be able to do so much
progress in btrfs_sync_log() while another task is at
btrfs_set_inode_last_trans() right after it reads root->log_transid and
before it reads root->last_log_commit. Even if kernel preemption is enabled
we know the task at btrfs_set_inode_last_trans() can not be preempted
because it is holding the inode's spinlock.
However there is another place where we do the same without holding the
spinlock, which is in the memory mapped write path at:
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
(...)
BTRFS_I(inode)->last_trans = fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
(...)
So with preemption happening after setting ->last_sub_trans and before
setting ->last_log_commit, it is less of a stretch to have another task
do enough progress at btrfs_sync_log() such that the task doing the memory
mapped write ends up with ->last_sub_trans and ->last_log_commit set to
the same value. It is still a big stretch to get there, as the task doing
btrfs_sync_log() has to start writeback, wait for its completion and write
the super blocks.
So fix this in two different ways:
1) For btrfs_set_inode_last_trans(), simply set ->last_log_commit to the
value of ->last_sub_trans minus 1;
2) For btrfs_page_mkwrite() only set the inode's ->last_sub_trans, just
like we do for buffered and direct writes at btrfs_file_write_iter(),
which is all we need to make sure multiple writes and fsyncs to an
inode in the same transaction never result in an fsync missing that
the inode changed and needs to be logged. Turn this into a helper
function and use it both at btrfs_page_mkwrite() and at
btrfs_file_write_iter() - this also fixes the problem that at
btrfs_page_mkwrite() we were setting those fields without the
protection of the inode's spinlock.
This is an extremely unlikely race to happen in practice.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-23 12:08:48 +00:00
|
|
|
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
|
2009-10-13 13:21:08 -04:00
|
|
|
|
2017-12-12 21:43:52 +01:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
|
2021-02-10 17:14:33 -05:00
|
|
|
up_read(&BTRFS_I(inode)->i_mmap_lock);
|
2007-06-15 13:50:00 -04:00
|
|
|
|
2019-12-03 16:59:25 +08:00
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
|
|
|
|
|
sb_end_pagefault(inode->i_sb);
|
|
|
|
|
extent_changeset_free(data_reserved);
|
|
|
|
|
return VM_FAULT_LOCKED;
|
2018-06-25 10:03:41 -07:00
|
|
|
|
|
|
|
|
out_unlock:
|
2007-06-15 13:50:00 -04:00
|
|
|
unlock_page(page);
|
2021-02-10 17:14:33 -05:00
|
|
|
up_read(&BTRFS_I(inode)->i_mmap_lock);
|
2007-12-21 16:27:21 -05:00
|
|
|
out:
|
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents()
[Background]
Btrfs qgroup uses two types of reserved space for METADATA space,
PERTRANS and PREALLOC.
PERTRANS is metadata space reserved for each transaction started by
btrfs_start_transaction().
While PREALLOC is for delalloc, where we reserve space before joining a
transaction, and finally it will be converted to PERTRANS after the
writeback is done.
[Inconsistency]
However there is inconsistency in how we handle PREALLOC metadata space.
The most obvious one is:
In btrfs_buffered_write():
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true);
We always free qgroup PREALLOC meta space.
While in btrfs_truncate_block():
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
We only free qgroup PREALLOC meta space when something went wrong.
[The Correct Behavior]
The correct behavior should be the one in btrfs_buffered_write(), we
should always free PREALLOC metadata space.
The reason is, the btrfs_delalloc_* mechanism works by:
- Reserve metadata first, even it's not necessary
In btrfs_delalloc_reserve_metadata()
- Free the unused metadata space
Normally in:
btrfs_delalloc_release_extents()
|- btrfs_inode_rsv_release()
Here we do calculation on whether we should release or not.
E.g. for 64K buffered write, the metadata rsv works like:
/* The first page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=0
total: num_bytes=calc_inode_reservations()
/* The first page caused one outstanding extent, thus needs metadata
rsv */
/* The 2nd page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed
/* The 2nd page doesn't cause new outstanding extent, needs no new meta
rsv, so we free what we have reserved */
/* The 3rd~16th pages */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed (still space for one outstanding extent)
This means, if btrfs_delalloc_release_extents() determines to free some
space, then those space should be freed NOW.
So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other
than btrfs_qgroup_convert_reserved_meta().
The good news is:
- The callers are not that hot
The hottest caller is in btrfs_buffered_write(), which is already
fixed by commit 336a8bb8e36a ("btrfs: Fix wrong
btrfs_delalloc_release_extents parameter"). Thus it's not that
easy to cause false EDQUOT.
- The trans commit in advance for qgroup would hide the bug
Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction
commit more aggressive"), when btrfs qgroup metadata free space is slow,
it will try to commit transaction and free the wrongly converted
PERTRANS space, so it's not that easy to hit such bug.
[FIX]
So to fix the problem, remove the @qgroup_free parameter for
btrfs_delalloc_release_extents(), and always pass true to
btrfs_inode_rsv_release().
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 14:34:51 +08:00
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
|
2020-06-03 08:55:40 +03:00
|
|
|
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transcation meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even if we use the wrong type (convert or free), it won't
cause an obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:32 +08:00
|
|
|
reserved_space, (ret != 0));
|
2012-01-25 13:47:40 -05:00
|
|
|
out_noreserve:
|
2012-06-12 16:20:45 +02:00
|
|
|
sb_end_pagefault(inode->i_sb);
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_free(data_reserved);
|
2007-06-15 13:50:00 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2018-02-06 20:40:31 +00:00
|
|
|
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2021-12-03 17:18:09 -05:00
|
|
|
struct btrfs_truncate_control control = {
|
2021-12-03 17:18:15 -05:00
|
|
|
.inode = BTRFS_I(inode),
|
2021-12-03 17:18:14 -05:00
|
|
|
.ino = btrfs_ino(BTRFS_I(inode)),
|
2021-12-03 17:18:09 -05:00
|
|
|
.min_type = BTRFS_EXTENT_DATA_KEY,
|
2021-12-03 17:18:13 -05:00
|
|
|
.clear_extent_range = true,
|
2021-12-03 17:18:09 -05:00
|
|
|
};
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-05-03 10:40:22 -04:00
|
|
|
struct btrfs_block_rsv *rsv;
|
2018-05-22 09:59:50 -07:00
|
|
|
int ret;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
2016-06-22 18:54:23 -04:00
|
|
|
u64 mask = fs_info->sectorsize - 1;
|
2019-08-22 15:14:33 -04:00
|
|
|
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2018-02-06 20:40:31 +00:00
|
|
|
if (!skip_writeback) {
|
|
|
|
|
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
|
|
|
|
|
(u64)-1);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2011-05-03 10:40:22 -04:00
|
|
|
/*
|
2018-05-11 13:13:32 -07:00
|
|
|
* Yes ladies and gentlemen, this is indeed ugly. We have a couple of
|
|
|
|
|
* things going on here:
|
2011-05-03 10:40:22 -04:00
|
|
|
*
|
2018-05-11 13:13:32 -07:00
|
|
|
* 1) We need to reserve space to update our inode.
|
2011-05-03 10:40:22 -04:00
|
|
|
*
|
2018-05-11 13:13:32 -07:00
|
|
|
* 2) We need to have something to cache all the space that is going to
|
2011-05-03 10:40:22 -04:00
|
|
|
* be free'd up by the truncate operation, but also have some slack
|
|
|
|
|
* space reserved in case it uses space during the truncate (thank you
|
|
|
|
|
* very much snapshotting).
|
|
|
|
|
*
|
2018-05-11 13:13:32 -07:00
|
|
|
* And we need these to be separate. The fact is we can use a lot of
|
2011-05-03 10:40:22 -04:00
|
|
|
* space doing the truncate, and we have no earthly idea how much space
|
2016-05-19 21:18:45 -04:00
|
|
|
* we will use, so we need the truncate reservation to be separate so it
|
2018-05-11 13:13:32 -07:00
|
|
|
* doesn't end up using space reserved for updating the inode. We also
|
|
|
|
|
* need to be able to stop the transaction and start a new one, which
|
|
|
|
|
* means we need to be able to update the inode several times, and we
|
|
|
|
|
* have no idea of knowing how many times that will be, so we can't just
|
|
|
|
|
* reserve 1 item for the entirety of the operation, so that has to be
|
|
|
|
|
* done separately as well.
|
2011-05-03 10:40:22 -04:00
|
|
|
*
|
|
|
|
|
* So that leaves us with
|
|
|
|
|
*
|
2018-05-11 13:13:32 -07:00
|
|
|
* 1) rsv - for the truncate reservation, which we will steal from the
|
2011-05-03 10:40:22 -04:00
|
|
|
* transaction reservation.
|
2018-05-11 13:13:32 -07:00
|
|
|
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
|
2011-05-03 10:40:22 -04:00
|
|
|
* updating the inode.
|
|
|
|
|
*/
|
2016-06-22 18:54:24 -04:00
|
|
|
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
|
2011-05-03 10:40:22 -04:00
|
|
|
if (!rsv)
|
|
|
|
|
return -ENOMEM;
|
2011-08-29 11:01:31 -04:00
|
|
|
rsv->size = min_size;
|
2022-06-23 17:08:14 +02:00
|
|
|
rsv->failfast = true;
|
2011-03-04 14:37:08 -05:00
|
|
|
|
2011-08-08 13:46:15 -04:00
|
|
|
/*
|
2011-08-19 10:29:59 -04:00
|
|
|
* 1 for the truncate slack space
|
2011-08-08 13:46:15 -04:00
|
|
|
* 1 for updating the inode.
|
|
|
|
|
*/
|
2013-01-07 17:03:21 -05:00
|
|
|
trans = btrfs_start_transaction(root, 2);
|
2011-05-03 10:40:22 -04:00
|
|
|
if (IS_ERR(trans)) {
|
2018-05-22 09:59:50 -07:00
|
|
|
ret = PTR_ERR(trans);
|
2011-05-03 10:40:22 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
2011-03-04 14:37:08 -05:00
|
|
|
|
2011-08-08 13:46:15 -04:00
|
|
|
/* Migrate the slack space for the truncate to our reserve */
|
2016-06-22 18:54:23 -04:00
|
|
|
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
|
2018-08-04 21:10:55 +08:00
|
|
|
min_size, false);
|
2011-05-03 10:40:22 -04:00
|
|
|
BUG_ON(ret);
|
2011-03-04 14:37:08 -05:00
|
|
|
|
2012-08-27 17:48:15 -04:00
|
|
|
trans->block_rsv = rsv;
|
2011-08-08 13:46:15 -04:00
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
while (1) {
|
2021-12-03 17:18:05 -05:00
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
|
const u64 new_size = inode->i_size;
|
|
|
|
|
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
|
|
|
|
|
|
2021-12-03 17:18:09 -05:00
|
|
|
control.new_size = new_size;
|
2021-12-03 17:18:05 -05:00
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
|
|
|
|
|
&cached_state);
|
|
|
|
|
/*
|
|
|
|
|
* We want to drop from the next block forward in case this new
|
|
|
|
|
* size is not block aligned since we will be keeping the last
|
|
|
|
|
* block of the extent just the way it is.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode),
|
|
|
|
|
ALIGN(new_size, fs_info->sectorsize),
|
|
|
|
|
(u64)-1, 0);
|
|
|
|
|
|
2021-12-03 17:18:15 -05:00
|
|
|
ret = btrfs_truncate_inode_items(trans, root, &control);
|
2021-12-03 17:18:10 -05:00
|
|
|
|
2021-12-03 17:18:11 -05:00
|
|
|
inode_sub_bytes(inode, control.sub_bytes);
|
2021-12-03 17:18:10 -05:00
|
|
|
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
|
|
|
|
|
|
2021-12-03 17:18:05 -05:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
|
|
|
|
|
(u64)-1, &cached_state);
|
|
|
|
|
|
2017-10-19 14:16:02 -04:00
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
2018-05-22 09:59:50 -07:00
|
|
|
if (ret != -ENOSPC && ret != -EAGAIN)
|
2009-11-12 09:35:36 +00:00
|
|
|
break;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2018-05-22 09:59:50 -07:00
|
|
|
if (ret)
|
2011-01-31 16:03:11 -05:00
|
|
|
break;
|
2012-08-27 17:48:15 -04:00
|
|
|
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2012-08-27 17:48:15 -04:00
|
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
|
|
|
if (IS_ERR(trans)) {
|
2018-05-22 09:59:50 -07:00
|
|
|
ret = PTR_ERR(trans);
|
2012-08-27 17:48:15 -04:00
|
|
|
trans = NULL;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2020-03-10 10:59:31 +02:00
|
|
|
btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
|
2016-06-22 18:54:23 -04:00
|
|
|
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
|
2018-08-04 21:10:55 +08:00
|
|
|
rsv, min_size, false);
|
2012-08-27 17:48:15 -04:00
|
|
|
BUG_ON(ret); /* shouldn't happen */
|
|
|
|
|
trans->block_rsv = rsv;
|
2009-11-12 09:35:36 +00:00
|
|
|
}
|
|
|
|
|
|
2017-10-19 14:16:02 -04:00
|
|
|
/*
|
|
|
|
|
* We can't call btrfs_truncate_block inside a trans handle as we could
|
2021-12-03 17:18:04 -05:00
|
|
|
* deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
|
|
|
|
|
* know we've truncated everything except the last little bit, and can
|
|
|
|
|
* do btrfs_truncate_block and then update the disk_i_size.
|
2017-10-19 14:16:02 -04:00
|
|
|
*/
|
2021-12-03 17:18:04 -05:00
|
|
|
if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
|
2017-10-19 14:16:02 -04:00
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
|
|
|
|
|
2020-11-02 16:49:03 +02:00
|
|
|
ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
|
2017-10-19 14:16:02 -04:00
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2020-11-02 16:48:53 +02:00
|
|
|
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
|
2017-10-19 14:16:02 -04:00
|
|
|
}
|
|
|
|
|
|
2011-11-08 14:49:59 -05:00
|
|
|
if (trans) {
|
2018-05-22 09:59:50 -07:00
|
|
|
int ret2;
|
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
2020-11-02 16:48:59 +02:00
|
|
|
ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2018-05-22 09:59:50 -07:00
|
|
|
if (ret2 && !ret)
|
|
|
|
|
ret = ret2;
|
2008-07-24 12:17:14 -04:00
|
|
|
|
2018-05-22 09:59:50 -07:00
|
|
|
ret2 = btrfs_end_transaction(trans);
|
|
|
|
|
if (ret2 && !ret)
|
|
|
|
|
ret = ret2;
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2011-11-08 14:49:59 -05:00
|
|
|
}
|
2011-05-03 10:40:22 -04:00
|
|
|
out:
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_free_block_rsv(fs_info, rsv);
|
2021-05-24 11:35:55 +01:00
|
|
|
/*
|
|
|
|
|
* So if we truncate and then write and fsync we normally would just
|
|
|
|
|
* write the extents that changed, which is a problem if we need to
|
|
|
|
|
* first truncate that entire inode. So set this flag so we write out
|
|
|
|
|
* all of the extents in the inode to the sync log so we're completely
|
|
|
|
|
* safe.
|
|
|
|
|
*
|
|
|
|
|
* If no extents were dropped or trimmed we don't need to force the next
|
|
|
|
|
* fsync to truncate all the inode's items from the log and re-log them
|
|
|
|
|
* all. This means the truncate operation did not change the file size,
|
|
|
|
|
* or changed it to a smaller size but there was only an implicit hole
|
|
|
|
|
* between the old i_size and the new i_size, and there were no prealloc
|
|
|
|
|
* extents beyond i_size to drop.
|
|
|
|
|
*/
|
2021-12-03 17:18:09 -05:00
|
|
|
if (control.extents_found > 0)
|
btrfs: reset last_reflink_trans after fsyncing inode
When an inode has a last_reflink_trans matching the current transaction,
we have to take special care when logging its checksums in order to
avoid getting checksum items with overlapping ranges in a log tree,
which could result in missing checksums after log replay (more on that
in the changelogs of commit 40e046acbd2f36 ("Btrfs: fix missing data
checksums after replaying a log tree") and commit e289f03ea79bbc ("btrfs:
fix corrupt log due to concurrent fsync of inodes with shared extents")).
We also need to make sure a full fsync will copy all old file extent
items it finds in modified leaves, because they might have been copied
from some other inode.
However once we fsync an inode, we don't need to keep paying the price of
that extra special care in future fsyncs done in the same transaction,
unless the inode is used for another reflink operation or the full sync
flag is set on it (truncate, failure to allocate extent maps for holes,
and other exceptional and infrequent cases).
So after we fsync an inode reset its last_reflink_trans to zero. In case
another reflink happens, we continue to update the last_reflink_trans of
the inode, just as before. Also set last_reflink_trans to the generation
of the last transaction that modified the inode whenever we need to set
the full sync flag on the inode, just like when we need to load an inode
from disk after eviction.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-02-17 12:12:06 +00:00
|
|
|
btrfs_set_inode_full_sync(BTRFS_I(inode));
|
2011-05-03 10:40:22 -04:00
|
|
|
|
2018-05-22 09:59:50 -07:00
|
|
|
return ret;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2022-03-14 18:12:32 -07:00
|
|
|
/*
 * Allocate a new VFS inode on the superblock of @dir and set it up as the
 * directory inode of a subvolume.
 *
 * @mnt_userns: passed through unchanged to inode_init_owner()
 * @dir:        only dir->i_sb is read here; NULL (not @dir) is handed to
 *              inode_init_owner() as the parent directory
 *
 * The mode is S_IFDIR plus the rwx permission bits (S_IRWXUGO) that are not
 * masked out by current_umask(). The inode and file operations are set to
 * the btrfs directory operation tables.
 *
 * Returns the new inode, or NULL if new_inode() returned NULL.
 */
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
|
|
|
|
|
struct inode *dir)
|
|
|
|
|
{
|
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
|
|
|
inode = new_inode(dir->i_sb);
|
|
|
|
|
if (inode) {
|
|
|
|
|
/*
|
|
|
|
|
* Subvolumes don't inherit the sgid bit or the parent's gid if
|
|
|
|
|
* the parent's sgid bit is set. This is probably a bug.
|
|
|
|
|
*/
|
|
|
|
|
inode_init_owner(mnt_userns, inode, NULL,
|
|
|
|
|
S_IFDIR | (~current_umask() & S_IRWXUGO));
|
|
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
|
}
|
|
|
|
|
return inode;
|
|
|
|
|
}
|
|
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
struct inode *btrfs_alloc_inode(struct super_block *sb)
|
|
|
|
|
{
|
2017-10-19 14:15:57 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_inode *ei;
|
2010-05-16 10:46:25 -04:00
|
|
|
struct inode *inode;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2022-03-22 14:41:03 -07:00
|
|
|
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (!ei)
|
|
|
|
|
return NULL;
|
2010-05-16 10:46:25 -04:00
|
|
|
|
|
|
|
|
ei->root = NULL;
|
|
|
|
|
ei->generation = 0;
|
2007-08-10 16:22:09 -04:00
|
|
|
ei->last_trans = 0;
|
2009-10-13 13:21:08 -04:00
|
|
|
ei->last_sub_trans = 0;
|
2008-09-05 16:13:11 -04:00
|
|
|
ei->logged_trans = 0;
|
2010-05-16 10:46:25 -04:00
|
|
|
ei->delalloc_bytes = 0;
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
ei->new_delalloc_bytes = 0;
|
2014-07-03 18:22:07 +08:00
|
|
|
ei->defrag_bytes = 0;
|
2010-05-16 10:46:25 -04:00
|
|
|
ei->disk_i_size = 0;
|
|
|
|
|
ei->flags = 0;
|
btrfs: add ro compat flags to inodes
Currently, inode flags are fully backwards incompatible in btrfs. If we
introduce a new inode flag, then tree-checker will detect it and fail.
This can even cause us to fail to mount entirely. To make it possible to
introduce new flags which can be read-only compatible, like VERITY, we
add new ro flags to btrfs without treating them quite so harshly in
tree-checker. A read-only file system can survive an unexpected flag,
and can be mounted.
As for the implementation, it unfortunately gets a little complicated.
The on-disk representation of the inode, btrfs_inode_item, has an __le64
for flags but the in-memory representation, btrfs_inode, uses a u32.
David Sterba had the nice idea that we could reclaim those wasted 32 bits
on disk and use them for the new ro_compat flags.
It turns out that the tree-checker code which checks for unknown flags
is broken, and ignores the upper 32 bits we are hoping to use. The issue
is that the flags use the literal 1 rather than 1ULL, so the flags are
signed ints, and one of them is specifically (1 << 31). As a result, the
mask which ORs the flags is a negative integer on machines where int is
32 bit twos complement. When tree-checker evaluates the expression:
btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)
The mask is something like 0x80000abc, which gets promoted to u64 with
sign extension to 0xffffffff80000abc. Negating that 64 bit mask leaves
all the upper bits zeroed, and we can't detect unexpected flags.
This suggests that we can't use those bits after all. Luckily, we have
good reason to believe that they are zero anyway. Inode flags are
metadata, which is always checksummed, so any bit flips that would
introduce 1s would cause a checksum failure anyway (excluding the
improbable case of the checksum getting corrupted exactly badly).
Further, unless the 1 << 31 flag is used, the cast to u64 of the 32 bit
inode flag should preserve its value and not add leading zeroes
(at least for twos complement). The only place that flag
(BTRFS_INODE_ROOT_ITEM_INIT) is used is in a special inode embedded in
the root item, and indeed for that inode we see 0xffffffff80000000 as
the flags on disk. However, that inode is never seen by tree checker,
nor is it used in a context where verity might be meaningful.
Theoretically, a future ro flag might cause trouble on that inode, so we
should proactively clean up that mess before it does.
With the introduction of the new ro flags, keep two separate unsigned
masks and check them against the appropriate u32. Since we no longer run
afoul of sign extension, this also stops writing out 0xffffffff80000000
in root_item inodes going forward.
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-06-30 13:01:48 -07:00
|
|
|
ei->ro_flags = 0;
|
2011-08-04 10:25:02 -04:00
|
|
|
ei->csum_bytes = 0;
|
2010-05-16 10:46:25 -04:00
|
|
|
ei->index_cnt = (u64)-1;
|
2013-12-26 13:07:06 +08:00
|
|
|
ei->dir_index = 0;
|
2010-05-16 10:46:25 -04:00
|
|
|
ei->last_unlink_trans = 0;
|
btrfs: reduce contention on log trees when logging checksums
The possibility of extents being shared (through clone and deduplication
operations) requires special care when logging data checksums, to avoid
having a log tree with different checksum items that cover ranges which
overlap (which resulted in missing checksums after replaying a log tree).
Such problems were fixed in the past by the following commits:
commit 40e046acbd2f ("Btrfs: fix missing data checksums after replaying a
log tree")
commit e289f03ea79b ("btrfs: fix corrupt log due to concurrent fsync of
inodes with shared extents")
Test case generic/588 exercises the scenario solved by the first commit
(purely sequential and deterministic) while test case generic/457 often
triggered the case fixed by the second commit (not deterministic, requires
specific timings under concurrency).
The problems were addressed by deleting, from the log tree, any existing
checksums before logging the new ones. And also by doing the deletion and
logging of the checksums while locking the checksum range in an extent io
tree (root->log_csum_range), to deal with the case where we have concurrent
fsyncs against files with shared extents.
That however causes more contention on the leaves of a log tree where we
store checksums (and all the nodes in the paths leading to them), even
when we do not have shared extents, or all the shared extents were created
by past transactions. It also adds a bit of contention on the spin lock of
the log_csums_range extent io tree of the log root.
This change adds a 'last_reflink_trans' field to the inode to keep track
of the last transaction where a new extent was shared between inodes
(through clone and deduplication operations). It is updated for both the
source and destination inodes of reflink operations whenever a new extent
(created in the current transaction) becomes shared by the inodes. This
field is kept in memory only, not persisted in the inode item, similar
to other existing fields (last_unlink_trans, logged_trans).
When logging checksums for an extent, if the value of 'last_reflink_trans'
is smaller than the current transaction's generation/id, we skip locking
the extent range and deletion of checksums from the log tree, since we
know we do not have new shared extents. This reduces contention on the
log tree's leaves where checksums are stored.
The following script, which uses fio, was used to measure the impact of
this change:
$ cat test-fsync.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 3 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=write
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=64k
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The tests were performed for different numbers of jobs, file sizes and
fsync frequency. A qemu VM using kvm was used, with 8 cores (the host has
12 cores, with cpu governance set to performance mode on all cores), 16GiB
of ram (the host has 64GiB) and using a NVMe device directly (without an
intermediary filesystem in the host). While running the tests, the host
was not used for anything else, to avoid disturbing the tests.
The obtained results were the following (the last line of fio's output was
pasted). Starting with 16 jobs is where a significant difference is
observable in this particular setup and hardware (differences highlighted
below). The very small differences for tests with less than 16 jobs are
possibly just noise and random.
**** 1 job, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=23.8MiB/s (24.9MB/s), 23.8MiB/s-23.8MiB/s (24.9MB/s-24.9MB/s), io=1024MiB (1074MB), run=43075-43075msec
after this change:
WRITE: bw=24.4MiB/s (25.6MB/s), 24.4MiB/s-24.4MiB/s (25.6MB/s-25.6MB/s), io=1024MiB (1074MB), run=41938-41938msec
**** 2 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54351-54351msec
after this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.6MiB/s-37.6MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54428-54428msec
**** 4 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=67.5MiB/s (70.8MB/s), 67.5MiB/s-67.5MiB/s (70.8MB/s-70.8MB/s), io=4096MiB (4295MB), run=60669-60669msec
after this change:
WRITE: bw=68.6MiB/s (71.0MB/s), 68.6MiB/s-68.6MiB/s (71.0MB/s-71.0MB/s), io=4096MiB (4295MB), run=59678-59678msec
**** 8 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=128MiB/s (134MB/s), 128MiB/s-128MiB/s (134MB/s-134MB/s), io=8192MiB (8590MB), run=64048-64048msec
after this change:
WRITE: bw=129MiB/s (135MB/s), 129MiB/s-129MiB/s (135MB/s-135MB/s), io=8192MiB (8590MB), run=63405-63405msec
**** 16 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=78.5MiB/s (82.3MB/s), 78.5MiB/s-78.5MiB/s (82.3MB/s-82.3MB/s), io=16.0GiB (17.2GB), run=208676-208676msec
after this change:
WRITE: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s), io=16.0GiB (17.2GB), run=149295-149295msec
(+40.1% throughput, -28.5% runtime)
**** 32 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=58.8MiB/s (61.7MB/s), 58.8MiB/s-58.8MiB/s (61.7MB/s-61.7MB/s), io=32.0GiB (34.4GB), run=557134-557134msec
after this change:
WRITE: bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430550-430550msec
(+29.4% throughput, -22.7% runtime)
**** 64 jobs, file size 512M, fsync frequency 1 ****
before this change:
WRITE: bw=65.8MiB/s (68.0MB/s), 65.8MiB/s-65.8MiB/s (68.0MB/s-68.0MB/s), io=32.0GiB (34.4GB), run=498055-498055msec
after this change:
WRITE: bw=85.1MiB/s (89.2MB/s), 85.1MiB/s-85.1MiB/s (89.2MB/s-89.2MB/s), io=32.0GiB (34.4GB), run=385116-385116msec
(+29.3% throughput, -22.7% runtime)
**** 128 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=54.7MiB/s (57.3MB/s), 54.7MiB/s-54.7MiB/s (57.3MB/s-57.3MB/s), io=32.0GiB (34.4GB), run=599373-599373msec
after this change:
WRITE: bw=121MiB/s (126MB/s), 121MiB/s-121MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=271907-271907msec
(+121.2% throughput, -54.6% runtime)
**** 256 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=69.2MiB/s (72.5MB/s), 69.2MiB/s-69.2MiB/s (72.5MB/s-72.5MB/s), io=64.0GiB (68.7GB), run=947536-947536msec
after this change:
WRITE: bw=121MiB/s (127MB/s), 121MiB/s-121MiB/s (127MB/s-127MB/s), io=64.0GiB (68.7GB), run=541916-541916msec
(+74.9% throughput, -42.8% runtime)
**** 512 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=85.4MiB/s (89.5MB/s), 85.4MiB/s-85.4MiB/s (89.5MB/s-89.5MB/s), io=64.0GiB (68.7GB), run=767734-767734msec
after this change:
WRITE: bw=141MiB/s (147MB/s), 141MiB/s-141MiB/s (147MB/s-147MB/s), io=64.0GiB (68.7GB), run=466022-466022msec
(+65.1% throughput, -39.3% runtime)
**** 1024 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=115MiB/s (120MB/s), 115MiB/s-115MiB/s (120MB/s-120MB/s), io=128GiB (137GB), run=1143775-1143775msec
after this change:
WRITE: bw=171MiB/s (180MB/s), 171MiB/s-171MiB/s (180MB/s-180MB/s), io=128GiB (137GB), run=764843-764843msec
(+48.7% throughput, -33.1% runtime)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-15 12:30:43 +01:00
|
|
|
ei->last_reflink_trans = 0;
|
2012-08-29 01:07:55 -06:00
|
|
|
ei->last_log_commit = 0;
|
2010-05-16 10:46:25 -04:00
|
|
|
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_lock_init(&ei->lock);
|
|
|
|
|
ei->outstanding_extents = 0;
|
2017-10-19 14:15:57 -04:00
|
|
|
if (sb->s_magic != BTRFS_TEST_MAGIC)
|
|
|
|
|
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
|
|
|
|
|
BTRFS_BLOCK_RSV_DELALLOC);
|
2012-05-23 14:13:11 -04:00
|
|
|
ei->runtime_flags = 0;
|
2017-07-17 19:17:20 +02:00
|
|
|
ei->prop_compress = BTRFS_COMPRESS_NONE;
|
2017-07-17 19:41:31 +02:00
|
|
|
ei->defrag_compress = BTRFS_COMPRESS_NONE;
|
2010-05-16 10:46:25 -04:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that uses two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which are waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the number of delayed items is beyond the lower limit, we create works for
some delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items is beyond the upper bound, we create works for
all the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait until the number of untreated items is below
some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search for it
in the inserting rb-tree first. If we find it, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cache the data of the
inode in the delayed node. The worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node. This way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
ei->delayed_node = NULL;
|
|
|
|
|
|
2012-07-04 12:48:07 +05:30
|
|
|
ei->i_otime.tv_sec = 0;
|
|
|
|
|
ei->i_otime.tv_nsec = 0;
|
|
|
|
|
|
2010-05-16 10:46:25 -04:00
|
|
|
inode = &ei->vfs_inode;
|
2011-04-21 00:34:43 +02:00
|
|
|
extent_map_tree_init(&ei->extent_tree);
|
2019-03-01 10:47:59 +08:00
|
|
|
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
|
|
|
|
|
extent_io_tree_init(fs_info, &ei->io_failure_tree,
|
|
|
|
|
IO_TREE_INODE_IO_FAILURE, inode);
|
2020-01-17 09:02:21 -05:00
|
|
|
extent_io_tree_init(fs_info, &ei->file_extent_tree,
|
|
|
|
|
IO_TREE_INODE_FILE_EXTENT, inode);
|
2019-03-11 15:58:30 +01:00
|
|
|
ei->io_tree.track_uptodate = true;
|
|
|
|
|
ei->io_failure_tree.track_uptodate = true;
|
2012-11-16 13:56:32 -05:00
|
|
|
atomic_set(&ei->sync_writers, 0);
|
2010-05-16 10:46:25 -04:00
|
|
|
mutex_init(&ei->log_mutex);
|
2008-07-17 12:53:50 -04:00
|
|
|
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
|
2010-05-16 10:46:25 -04:00
|
|
|
INIT_LIST_HEAD(&ei->delalloc_inodes);
|
2015-11-19 14:15:51 +01:00
|
|
|
INIT_LIST_HEAD(&ei->delayed_iput);
|
2010-05-16 10:46:25 -04:00
|
|
|
RB_CLEAR_NODE(&ei->rb_node);
|
2021-02-10 17:14:33 -05:00
|
|
|
init_rwsem(&ei->i_mmap_lock);
|
2010-05-16 10:46:25 -04:00
|
|
|
|
|
|
|
|
return inode;
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2013-10-11 14:44:09 -04:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test teardown for an inode: drop every cached extent mapping
 * for the whole [0, (u64)-1] range, then hand the containing btrfs_inode
 * back to btrfs_inode_cachep.
 */
void btrfs_test_destroy_inode(struct inode *inode)
{
	struct btrfs_inode *ei = BTRFS_I(inode);

	btrfs_drop_extent_cache(ei, 0, (u64)-1, 0);
	kmem_cache_free(btrfs_inode_cachep, ei);
}
#endif
|
|
|
|
|
|
2019-04-10 15:14:41 -04:00
|
|
|
void btrfs_free_inode(struct inode *inode)
|
2011-01-07 17:49:49 +11:00
|
|
|
{
|
|
|
|
|
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-18 12:15:49 +03:00
|
|
|
void btrfs_destroy_inode(struct inode *vfs_inode)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2008-07-17 12:53:50 -04:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2020-09-18 12:15:49 +03:00
|
|
|
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
2022-07-25 15:11:59 -07:00
|
|
|
bool freespace_inode;
|
2009-03-31 13:27:11 -04:00
|
|
|
|
2020-09-18 12:15:49 +03:00
|
|
|
WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
|
|
|
|
|
WARN_ON(vfs_inode->i_data.nrpages);
|
|
|
|
|
WARN_ON(inode->block_rsv.reserved);
|
|
|
|
|
WARN_ON(inode->block_rsv.size);
|
|
|
|
|
WARN_ON(inode->outstanding_extents);
|
btrfs: keep track of the last logged keys when logging a directory
After the first time we log a directory in the current transaction, for
each directory item in a changed leaf of the subvolume tree, we have to
check if we previously logged the item, in order to overwrite it in case
its data changed or skip it in case its data hasn't changed.
Checking if we have logged each item before not only wastes time, but it
also adds lock contention on the log tree. So in order to minimize the
number of times we do such checks, keep track of the offset of the last
key we logged for a directory and, on the next time we log the directory,
skip the checks for any new keys that have an offset greater than the
offset we have previously saved. This is especially effective for index
keys, because the offset for these keys comes from a monotonically
increasing counter.
This patch is part of a patchset comprised of the following 5 patches:
btrfs: remove root argument from btrfs_log_inode() and its callees
btrfs: remove redundant log root assignment from log_dir_items()
btrfs: factor out the copying loop of dir items from log_dir_items()
btrfs: insert items in batches when logging a directory when possible
btrfs: keep track of the last logged keys when logging a directory
This is patch 5/5.
The following test was used on a non-debug kernel to measure the impact
it has on a directory fsync:
$ cat test-dir-fsync.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_NEW_FILES=100000
NUM_FILE_DELETES=1000
mkfs.btrfs -f $DEV
mount -o ssd $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_NEW_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
# fsync the directory, this will log the new dir items and the inodes
# they point to, because these are new inodes.
start=$(date +%s%N)
xfs_io -c "fsync" $MNT/testdir
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files"
# sync to force transaction commit and wipeout the log.
sync
del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES ))
for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do
rm -f $MNT/testdir/file_$i
done
# fsync the directory, this will only log dir items, there are no
# dentries pointing to new inodes.
start=$(date +%s%N)
xfs_io -c "fsync" $MNT/testdir
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files"
umount $MNT
Test results with NUM_NEW_FILES set to 100 000 and 1 000 000:
**** before patchset, 100 000 files, 1000 deletes ****
dir fsync took 848 ms after adding 100000 files
dir fsync took 175 ms after deleting 1000 files
**** after patchset, 100 000 files, 1000 deletes ****
dir fsync took 758 ms after adding 100000 files (-11.2%)
dir fsync took 63 ms after deleting 1000 files (-94.1%)
**** before patchset, 1 000 000 files, 1000 deletes ****
dir fsync took 9945 ms after adding 1000000 files
dir fsync took 473 ms after deleting 1000 files
**** after patchset, 1 000 000 files, 1000 deletes ****
dir fsync took 8677 ms after adding 1000000 files (-13.6%)
dir fsync took 146 ms after deleting 1000 files (-105.6%)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-09-16 11:32:14 +01:00
|
|
|
if (!S_ISDIR(vfs_inode->i_mode)) {
|
|
|
|
|
WARN_ON(inode->delalloc_bytes);
|
|
|
|
|
WARN_ON(inode->new_delalloc_bytes);
|
|
|
|
|
}
|
2020-09-18 12:15:49 +03:00
|
|
|
WARN_ON(inode->csum_bytes);
|
|
|
|
|
WARN_ON(inode->defrag_bytes);
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2009-11-11 15:53:34 -05:00
|
|
|
/*
|
|
|
|
|
* This can happen where we create an inode, but somebody else also
|
|
|
|
|
* created the same inode and we need to destroy the one we already
|
|
|
|
|
* created.
|
|
|
|
|
*/
|
|
|
|
|
if (!root)
|
2019-04-10 15:14:41 -04:00
|
|
|
return;
|
2009-11-11 15:53:34 -05:00
|
|
|
|
2022-07-25 15:11:59 -07:00
|
|
|
/*
|
|
|
|
|
* If this is a free space inode do not take the ordered extents lockdep
|
|
|
|
|
* map.
|
|
|
|
|
*/
|
|
|
|
|
freespace_inode = btrfs_is_free_space_inode(inode);
|
|
|
|
|
|
2009-01-05 21:25:51 -05:00
|
|
|
while (1) {
|
2020-09-18 12:15:49 +03:00
|
|
|
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
|
2008-07-17 12:53:50 -04:00
|
|
|
if (!ordered)
|
|
|
|
|
break;
|
|
|
|
|
else {
|
2020-09-18 12:15:49 +03:00
|
|
|
btrfs_err(root->fs_info,
|
2016-09-20 10:05:00 -04:00
|
|
|
"found ordered extent %llu %llu on inode cleanup",
|
2019-12-02 17:34:19 -08:00
|
|
|
ordered->file_offset, ordered->num_bytes);
|
2022-07-25 15:11:59 -07:00
|
|
|
|
|
|
|
|
if (!freespace_inode)
|
|
|
|
|
btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
|
|
|
|
|
|
2020-09-18 12:15:50 +03:00
|
|
|
btrfs_remove_ordered_extent(inode, ordered);
|
2008-07-17 12:53:50 -04:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-09-18 12:15:49 +03:00
|
|
|
btrfs_qgroup_check_reserved_leak(inode);
|
|
|
|
|
inode_tree_del(inode);
|
|
|
|
|
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
|
|
|
|
|
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
|
|
|
|
|
btrfs_put_root(inode->root);
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2010-06-07 13:43:19 -04:00
|
|
|
int btrfs_drop_inode(struct inode *inode)
|
2009-09-21 16:00:26 -04:00
|
|
|
{
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-06-07 13:43:19 -04:00
|
|
|
|
2013-06-06 09:56:34 +00:00
|
|
|
if (root == NULL)
|
|
|
|
|
return 1;
|
|
|
|
|
|
Btrfs: fix cleaner thread not working with inode cache option
Right now inode cache inode is treated as the same as space cache
inode, ie. keep inode in memory till putting super.
But this leads to an awkward situation.
If we're going to delete a snapshot/subvolume, btrfs will not
actually delete it and return free space, but will add it to dead
roots list until the last inode on this snap/subvol is destroyed.
Then we'll fetch deleted roots and cleanup them via cleaner thread.
So here is the problem, if we enable inode cache option, each
snap/subvol has a cached inode which is used to store inode allocation
information. And this cache inode will be kept in memory, as the above
said. So with inode cache, snap/subvol can only be added into
dead roots list during freeing roots stage in umount, so that we can
ONLY get space back after another remount (we clean up dead roots on mount).
But the real thing is we'll no more use the snap/subvol if we mark it
deleted, so we can safely iput its cache inode when we delete snap/subvol.
Another thing is that we need to change the rules of dropping inodes: we
don't keep snap/subvol's cache inode in memory till end so that we can
add snap/subvol into dead roots list in time.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-02-20 14:10:23 +00:00
|
|
|
/* the snap/subvol tree is being deleted */
|
2013-09-05 16:58:43 +02:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
2010-06-07 13:43:19 -04:00
|
|
|
return 1;
|
2009-09-21 16:00:26 -04:00
|
|
|
else
|
2010-06-07 13:43:19 -04:00
|
|
|
return generic_drop_inode(inode);
|
2009-09-21 16:00:26 -04:00
|
|
|
}
|
|
|
|
|
|
2008-07-30 16:54:26 -04:00
|
|
|
static void init_once(void *foo)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2022-03-31 03:34:08 -07:00
|
|
|
struct btrfs_inode *ei = foo;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
inode_init_once(&ei->vfs_inode);
|
|
|
|
|
}
|
|
|
|
|
|
2018-02-19 17:24:18 +01:00
|
|
|
void __cold btrfs_destroy_cachep(void)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2012-09-26 11:33:07 +10:00
|
|
|
/*
|
|
|
|
|
* Make sure all delayed rcu free inodes are flushed before we
|
|
|
|
|
* destroy cache.
|
|
|
|
|
*/
|
|
|
|
|
rcu_barrier();
|
2022-05-05 15:11:15 -05:00
|
|
|
bioset_exit(&btrfs_dio_bioset);
|
2016-01-29 21:36:35 +08:00
|
|
|
kmem_cache_destroy(btrfs_inode_cachep);
|
|
|
|
|
kmem_cache_destroy(btrfs_trans_handle_cachep);
|
|
|
|
|
kmem_cache_destroy(btrfs_path_cachep);
|
|
|
|
|
kmem_cache_destroy(btrfs_free_space_cachep);
|
btrfs: fix allocation of free space cache v1 bitmap pages
Various notifications of type "BUG kmalloc-4096 () : Redzone
overwritten" have been observed recently in various parts of the kernel.
After some time, this was correlated with the use of the BTRFS
filesystem and with SLUB_DEBUG turned on.
[ 22.809700] BUG kmalloc-4096 (Tainted: G W ): Redzone overwritten
[ 22.810286] INFO: 0xbe1a5921-0xfbfc06cd. First byte 0x0 instead of 0xcc
[ 22.810866] INFO: Allocated in __load_free_space_cache+0x588/0x780 [btrfs] age=22 cpu=0 pid=224
[ 22.811193] __slab_alloc.constprop.26+0x44/0x70
[ 22.811345] kmem_cache_alloc_trace+0xf0/0x2ec
[ 22.811588] __load_free_space_cache+0x588/0x780 [btrfs]
[ 22.811848] load_free_space_cache+0xf4/0x1b0 [btrfs]
[ 22.812090] cache_block_group+0x1d0/0x3d0 [btrfs]
[ 22.812321] find_free_extent+0x680/0x12a4 [btrfs]
[ 22.812549] btrfs_reserve_extent+0xec/0x220 [btrfs]
[ 22.812785] btrfs_alloc_tree_block+0x178/0x5f4 [btrfs]
[ 22.813032] __btrfs_cow_block+0x150/0x5d4 [btrfs]
[ 22.813262] btrfs_cow_block+0x194/0x298 [btrfs]
[ 22.813484] commit_cowonly_roots+0x44/0x294 [btrfs]
[ 22.813718] btrfs_commit_transaction+0x63c/0xc0c [btrfs]
[ 22.813973] close_ctree+0xf8/0x2a4 [btrfs]
[ 22.814107] generic_shutdown_super+0x80/0x110
[ 22.814250] kill_anon_super+0x18/0x30
[ 22.814437] btrfs_kill_super+0x18/0x90 [btrfs]
[ 22.814590] INFO: Freed in proc_cgroup_show+0xc0/0x248 age=41 cpu=0 pid=83
[ 22.814841] proc_cgroup_show+0xc0/0x248
[ 22.814967] proc_single_show+0x54/0x98
[ 22.815086] seq_read+0x278/0x45c
[ 22.815190] __vfs_read+0x28/0x17c
[ 22.815289] vfs_read+0xa8/0x14c
[ 22.815381] ksys_read+0x50/0x94
[ 22.815475] ret_from_syscall+0x0/0x38
Commit 69d2480456d1 ("btrfs: use copy_page for copying pages instead of
memcpy") changed the way bitmap blocks are copied. But although bitmaps
have the size of a page, they were allocated with kzalloc().
Most of the time, kzalloc() allocates aligned blocks of memory, so
copy_page() can be used. But when some debug options like SLAB_DEBUG are
activated, kzalloc() may return an unaligned pointer.
On powerpc, memcpy(), copy_page() and other copying functions use
'dcbz' instruction which provides an entire zeroed cacheline to avoid
memory read when the intention is to overwrite a full line. Functions
like memcpy() are written to care about partial cachelines at the start
and end of the destination, but copy_page() assumes it gets pages. As
pages are naturally cache aligned, copy_page() doesn't care about
partial lines. This means that when copy_page() is called with a
misaligned pointer, a few leading bytes are zeroed.
To fix it, allocate bitmaps through kmem_cache instead of using kzalloc()
The cache pool is created with PAGE_SIZE alignment constraint.
Reported-by: Erhard F. <erhard_f@mailbox.org>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204371
Fixes: 69d2480456d1 ("btrfs: use copy_page for copying pages instead of memcpy")
Cc: stable@vger.kernel.org # 4.19+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: David Sterba <dsterba@suse.com>
[ rename to btrfs_free_space_bitmap ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-21 15:05:55 +00:00
|
|
|
kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
|
|
|
|
|
2017-11-02 17:21:50 -06:00
|
|
|
int __init btrfs_init_cachep(void)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2012-09-07 03:00:48 -06:00
|
|
|
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
|
2009-04-13 15:33:09 +02:00
|
|
|
sizeof(struct btrfs_inode), 0,
|
2016-01-14 15:18:21 -08:00
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
|
|
|
|
|
init_once);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (!btrfs_inode_cachep)
|
|
|
|
|
goto fail;
|
2009-04-13 15:33:09 +02:00
|
|
|
|
2012-09-07 03:00:48 -06:00
|
|
|
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
|
2009-04-13 15:33:09 +02:00
|
|
|
sizeof(struct btrfs_trans_handle), 0,
|
2016-06-23 21:17:08 +03:00
|
|
|
SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (!btrfs_trans_handle_cachep)
|
|
|
|
|
goto fail;
|
2009-04-13 15:33:09 +02:00
|
|
|
|
2012-09-07 03:00:48 -06:00
|
|
|
btrfs_path_cachep = kmem_cache_create("btrfs_path",
|
2009-04-13 15:33:09 +02:00
|
|
|
sizeof(struct btrfs_path), 0,
|
2016-06-23 21:17:08 +03:00
|
|
|
SLAB_MEM_SPREAD, NULL);
|
2007-06-12 06:35:45 -04:00
|
|
|
if (!btrfs_path_cachep)
|
|
|
|
|
goto fail;
|
2009-04-13 15:33:09 +02:00
|
|
|
|
2012-09-07 03:00:48 -06:00
|
|
|
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
|
2011-01-28 17:05:48 -05:00
|
|
|
sizeof(struct btrfs_free_space), 0,
|
2016-06-23 21:17:08 +03:00
|
|
|
SLAB_MEM_SPREAD, NULL);
|
2011-01-28 17:05:48 -05:00
|
|
|
if (!btrfs_free_space_cachep)
|
|
|
|
|
goto fail;
|
|
|
|
|
|
btrfs: fix allocation of free space cache v1 bitmap pages
Various notifications of type "BUG kmalloc-4096 () : Redzone
overwritten" have been observed recently in various parts of the kernel.
After some time, this was correlated with the use of the BTRFS
filesystem and with SLUB_DEBUG turned on.
[ 22.809700] BUG kmalloc-4096 (Tainted: G W ): Redzone overwritten
[ 22.810286] INFO: 0xbe1a5921-0xfbfc06cd. First byte 0x0 instead of 0xcc
[ 22.810866] INFO: Allocated in __load_free_space_cache+0x588/0x780 [btrfs] age=22 cpu=0 pid=224
[ 22.811193] __slab_alloc.constprop.26+0x44/0x70
[ 22.811345] kmem_cache_alloc_trace+0xf0/0x2ec
[ 22.811588] __load_free_space_cache+0x588/0x780 [btrfs]
[ 22.811848] load_free_space_cache+0xf4/0x1b0 [btrfs]
[ 22.812090] cache_block_group+0x1d0/0x3d0 [btrfs]
[ 22.812321] find_free_extent+0x680/0x12a4 [btrfs]
[ 22.812549] btrfs_reserve_extent+0xec/0x220 [btrfs]
[ 22.812785] btrfs_alloc_tree_block+0x178/0x5f4 [btrfs]
[ 22.813032] __btrfs_cow_block+0x150/0x5d4 [btrfs]
[ 22.813262] btrfs_cow_block+0x194/0x298 [btrfs]
[ 22.813484] commit_cowonly_roots+0x44/0x294 [btrfs]
[ 22.813718] btrfs_commit_transaction+0x63c/0xc0c [btrfs]
[ 22.813973] close_ctree+0xf8/0x2a4 [btrfs]
[ 22.814107] generic_shutdown_super+0x80/0x110
[ 22.814250] kill_anon_super+0x18/0x30
[ 22.814437] btrfs_kill_super+0x18/0x90 [btrfs]
[ 22.814590] INFO: Freed in proc_cgroup_show+0xc0/0x248 age=41 cpu=0 pid=83
[ 22.814841] proc_cgroup_show+0xc0/0x248
[ 22.814967] proc_single_show+0x54/0x98
[ 22.815086] seq_read+0x278/0x45c
[ 22.815190] __vfs_read+0x28/0x17c
[ 22.815289] vfs_read+0xa8/0x14c
[ 22.815381] ksys_read+0x50/0x94
[ 22.815475] ret_from_syscall+0x0/0x38
Commit 69d2480456d1 ("btrfs: use copy_page for copying pages instead of
memcpy") changed the way bitmap blocks are copied. But although bitmaps
have the size of a page, they were allocated with kzalloc().
Most of the time, kzalloc() allocates aligned blocks of memory, so
copy_page() can be used. But when some debug options like SLAB_DEBUG are
activated, kzalloc() may return an unaligned pointer.
On powerpc, memcpy(), copy_page() and other copying functions use
'dcbz' instruction which provides an entire zeroed cacheline to avoid
memory read when the intention is to overwrite a full line. Functions
like memcpy() are written to care about partial cachelines at the start
and end of the destination, but copy_page() assumes it gets pages. As
pages are naturally cache aligned, copy_page() doesn't care about
partial lines. This means that when copy_page() is called with a
misaligned pointer, a few leading bytes are zeroed.
To fix it, allocate bitmaps through kmem_cache instead of using kzalloc()
The cache pool is created with PAGE_SIZE alignment constraint.
Reported-by: Erhard F. <erhard_f@mailbox.org>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204371
Fixes: 69d2480456d1 ("btrfs: use copy_page for copying pages instead of memcpy")
Cc: stable@vger.kernel.org # 4.19+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: David Sterba <dsterba@suse.com>
[ rename to btrfs_free_space_bitmap ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-21 15:05:55 +00:00
|
|
|
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
|
|
|
|
|
PAGE_SIZE, PAGE_SIZE,
|
2021-03-15 15:18:24 +01:00
|
|
|
SLAB_MEM_SPREAD, NULL);
|
btrfs: fix allocation of free space cache v1 bitmap pages
Various notifications of type "BUG kmalloc-4096 () : Redzone
overwritten" have been observed recently in various parts of the kernel.
After some time, this was correlated with the use of the BTRFS
filesystem and with SLUB_DEBUG turned on.
[ 22.809700] BUG kmalloc-4096 (Tainted: G W ): Redzone overwritten
[ 22.810286] INFO: 0xbe1a5921-0xfbfc06cd. First byte 0x0 instead of 0xcc
[ 22.810866] INFO: Allocated in __load_free_space_cache+0x588/0x780 [btrfs] age=22 cpu=0 pid=224
[ 22.811193] __slab_alloc.constprop.26+0x44/0x70
[ 22.811345] kmem_cache_alloc_trace+0xf0/0x2ec
[ 22.811588] __load_free_space_cache+0x588/0x780 [btrfs]
[ 22.811848] load_free_space_cache+0xf4/0x1b0 [btrfs]
[ 22.812090] cache_block_group+0x1d0/0x3d0 [btrfs]
[ 22.812321] find_free_extent+0x680/0x12a4 [btrfs]
[ 22.812549] btrfs_reserve_extent+0xec/0x220 [btrfs]
[ 22.812785] btrfs_alloc_tree_block+0x178/0x5f4 [btrfs]
[ 22.813032] __btrfs_cow_block+0x150/0x5d4 [btrfs]
[ 22.813262] btrfs_cow_block+0x194/0x298 [btrfs]
[ 22.813484] commit_cowonly_roots+0x44/0x294 [btrfs]
[ 22.813718] btrfs_commit_transaction+0x63c/0xc0c [btrfs]
[ 22.813973] close_ctree+0xf8/0x2a4 [btrfs]
[ 22.814107] generic_shutdown_super+0x80/0x110
[ 22.814250] kill_anon_super+0x18/0x30
[ 22.814437] btrfs_kill_super+0x18/0x90 [btrfs]
[ 22.814590] INFO: Freed in proc_cgroup_show+0xc0/0x248 age=41 cpu=0 pid=83
[ 22.814841] proc_cgroup_show+0xc0/0x248
[ 22.814967] proc_single_show+0x54/0x98
[ 22.815086] seq_read+0x278/0x45c
[ 22.815190] __vfs_read+0x28/0x17c
[ 22.815289] vfs_read+0xa8/0x14c
[ 22.815381] ksys_read+0x50/0x94
[ 22.815475] ret_from_syscall+0x0/0x38
Commit 69d2480456d1 ("btrfs: use copy_page for copying pages instead of
memcpy") changed the way bitmap blocks are copied. But although bitmaps
have the size of a page, they were allocated with kzalloc().
Most of the time, kzalloc() allocates aligned blocks of memory, so
copy_page() can be used. But when some debug options like SLAB_DEBUG are
activated, kzalloc() may return an unaligned pointer.
On powerpc, memcpy(), copy_page() and other copying functions use
'dcbz' instruction which provides an entire zeroed cacheline to avoid
memory read when the intention is to overwrite a full line. Functions
like memcpy() are written to care about partial cachelines at the start
and end of the destination, but copy_page() assumes it gets pages. As
pages are naturally cache aligned, copy_page() doesn't care about
partial lines. This means that when copy_page() is called with a
misaligned pointer, a few leading bytes are zeroed.
To fix it, allocate bitmaps through kmem_cache instead of using kzalloc()
The cache pool is created with PAGE_SIZE alignment constraint.
Reported-by: Erhard F. <erhard_f@mailbox.org>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204371
Fixes: 69d2480456d1 ("btrfs: use copy_page for copying pages instead of memcpy")
Cc: stable@vger.kernel.org # 4.19+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: David Sterba <dsterba@suse.com>
[ rename to btrfs_free_space_bitmap ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-21 15:05:55 +00:00
|
|
|
if (!btrfs_free_space_bitmap_cachep)
|
|
|
|
|
goto fail;
|
|
|
|
|
|
2022-05-05 15:11:15 -05:00
|
|
|
if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
|
|
|
|
|
offsetof(struct btrfs_dio_private, bio),
|
|
|
|
|
BIOSET_NEED_BVECS))
|
|
|
|
|
goto fail;
|
|
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
return 0;
|
|
|
|
|
fail:
|
|
|
|
|
btrfs_destroy_cachep();
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int btrfs_getattr(struct user_namespace *mnt_userns,
|
|
|
|
|
const struct path *path, struct kstat *stat,
|
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spare*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
|
|
|
u32 request_mask, unsigned int flags)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2013-01-29 10:11:59 +00:00
|
|
|
u64 delalloc_bytes;
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be and does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a heavily used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
u64 inode_bytes;
|
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spare*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
|
|
|
struct inode *inode = d_inode(path->dentry);
|
2011-11-20 07:33:38 -05:00
|
|
|
u32 blocksize = inode->i_sb->s_blocksize;
|
2017-05-12 15:07:43 -07:00
|
|
|
u32 bi_flags = BTRFS_I(inode)->flags;
|
2021-06-30 13:01:49 -07:00
|
|
|
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
|
2017-05-12 15:07:43 -07:00
|
|
|
|
|
|
|
|
stat->result_mask |= STATX_BTIME;
|
|
|
|
|
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
|
|
|
|
|
stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
|
|
|
|
|
if (bi_flags & BTRFS_INODE_APPEND)
|
|
|
|
|
stat->attributes |= STATX_ATTR_APPEND;
|
|
|
|
|
if (bi_flags & BTRFS_INODE_COMPRESS)
|
|
|
|
|
stat->attributes |= STATX_ATTR_COMPRESSED;
|
|
|
|
|
if (bi_flags & BTRFS_INODE_IMMUTABLE)
|
|
|
|
|
stat->attributes |= STATX_ATTR_IMMUTABLE;
|
|
|
|
|
if (bi_flags & BTRFS_INODE_NODUMP)
|
|
|
|
|
stat->attributes |= STATX_ATTR_NODUMP;
|
2021-06-30 13:01:49 -07:00
|
|
|
if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
|
|
|
|
|
stat->attributes |= STATX_ATTR_VERITY;
|
2017-05-12 15:07:43 -07:00
|
|
|
|
|
|
|
|
stat->attributes_mask |= (STATX_ATTR_APPEND |
|
|
|
|
|
STATX_ATTR_COMPRESSED |
|
|
|
|
|
STATX_ATTR_IMMUTABLE |
|
|
|
|
|
STATX_ATTR_NODUMP);
|
2011-11-20 07:33:38 -05:00
|
|
|
|
2021-07-27 12:48:43 +02:00
|
|
|
generic_fillattr(mnt_userns, inode, stat);
|
2011-07-07 15:44:25 -04:00
|
|
|
stat->dev = BTRFS_I(inode)->root->anon_dev;
|
2013-01-29 10:11:59 +00:00
|
|
|
|
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and within holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 10:45:46 +01:00
|
|
|
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very frequently used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
inode_bytes = inode_get_bytes(inode);
|
2013-01-29 10:11:59 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very frequently used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
stat->blocks = (ALIGN(inode_bytes, blocksize) +
|
2013-01-29 10:11:59 +00:00
|
|
|
ALIGN(delalloc_bytes, blocksize)) >> 9;
|
2007-06-12 06:35:45 -04:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-17 15:23:38 +01:00
|
|
|
static int btrfs_rename_exchange(struct inode *old_dir,
|
|
|
|
|
struct dentry *old_dentry,
|
|
|
|
|
struct inode *new_dir,
|
|
|
|
|
struct dentry *new_dentry)
|
|
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
|
2016-03-17 15:23:38 +01:00
|
|
|
struct btrfs_trans_handle *trans;
|
2022-03-09 17:31:32 -08:00
|
|
|
unsigned int trans_num_items;
|
2016-03-17 15:23:38 +01:00
|
|
|
struct btrfs_root *root = BTRFS_I(old_dir)->root;
|
|
|
|
|
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
|
|
|
|
|
struct inode *new_inode = new_dentry->d_inode;
|
|
|
|
|
struct inode *old_inode = old_dentry->d_inode;
|
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
|
|
|
struct timespec64 ctime = current_time(old_inode);
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
struct btrfs_rename_ctx old_rename_ctx;
|
|
|
|
|
struct btrfs_rename_ctx new_rename_ctx;
|
2017-01-10 20:35:31 +02:00
|
|
|
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
|
|
|
|
|
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
|
2016-03-17 15:23:38 +01:00
|
|
|
u64 old_idx = 0;
|
|
|
|
|
u64 new_idx = 0;
|
|
|
|
|
int ret;
|
btrfs: do not commit logs and transactions during link and rename operations
Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we
started to commit logs, and fallback to transaction commits when we failed
to log the new names or commit the logs, after link and rename operations
when the target inodes (or their parents) were previously logged in the
current transaction. This was to avoid losing directories despite an
explicit fsync on them when they are ancestors of some inode that got a
new named logged, due to a link or rename operation. However that adds the
cost of starting IO and waiting for it to complete, which can cause higher
latencies for applications.
Instead of doing that, just make sure that when we log a new name for an
inode we don't mark any of its ancestors as logged, so that if any one
does an fsync against any of them, without doing any other change on them,
the fsync commits the log. This way we only pay the cost of a log commit
(or a transaction commit if something goes wrong or a new block group was
created) if the application explicitly asks to fsync any of the parent
directories.
Using dbench, which mixes several filesystem operations including renames,
revealed some significant latency gains. The following script that uses
dbench was used to test this:
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/btrfs
MOUNT_OPTIONS="-o ssd -o space_cache=v2"
MKFS_OPTIONS="-m single -d single"
THREADS=16
echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -t 300 -D $MNT $THREADS
umount $MNT
The test was run on bare metal, no virtualization, on a box with 12 cores
(Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel
configuration that is the default of typical distributions (debian in this
case), without debug options enabled (kasan, kmemleak, slub debug, debug
of page allocations, lock debugging, etc).
Results before this patch:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 10750455 0.011 155.088
Close 7896674 0.001 0.243
Rename 455222 2.158 1101.947
Unlink 2171189 0.067 121.638
Deltree 256 2.425 7.816
Mkdir 128 0.002 0.003
Qpathinfo 9744323 0.006 21.370
Qfileinfo 1707092 0.001 0.146
Qfsinfo 1786756 0.001 11.228
Sfileinfo 875612 0.003 21.263
Find 3767281 0.025 9.617
WriteX 5356924 0.011 211.390
ReadX 16852694 0.003 9.442
LockX 35008 0.002 0.119
UnlockX 35008 0.001 0.138
Flush 753458 4.252 1102.249
Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms
Results after this patch:
16 clients, after
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 11471098 0.012 448.281
Close 8426396 0.001 0.925
Rename 485746 0.123 267.183
Unlink 2316477 0.080 63.433
Deltree 288 2.830 11.144
Mkdir 144 0.003 0.010
Qpathinfo 10397420 0.006 10.288
Qfileinfo 1822039 0.001 0.169
Qfsinfo 1906497 0.002 14.039
Sfileinfo 934433 0.004 2.438
Find 4019879 0.026 10.200
WriteX 5718932 0.011 200.985
ReadX 17981671 0.003 10.036
LockX 37352 0.002 0.076
UnlockX 37352 0.001 0.109
Flush 804018 5.015 778.033
Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms
(+6.5% throughput, -29.4% max latency, -75.8% rename latency)
Test case generic/498 from fstests tests the scenario that the previously
mentioned commit fixed.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-11 12:43:48 +01:00
|
|
|
int ret2;
|
2021-05-19 14:04:21 -04:00
|
|
|
bool need_abort = false;
|
2016-03-17 15:23:38 +01:00
|
|
|
|
btrfs: prevent rename2 from exchanging a subvol with a directory from different parents
Cross-rename lacks a check that would prevent exchanging a
directory and a subvolume from different parent subvolumes. This causes
data inconsistencies and is caught before commit by tree-checker,
turning the filesystem to read-only.
Calling renameat2 with the RENAME_EXCHANGE flag like
renameat2(AT_FDCWD, namesrc, AT_FDCWD, namedest, (1 << 1))
on two paths:
namesrc = dir1/subvol1/dir2
namedest = subvol2/subvol3
will cause a key order problem with the following write-time tree-checker
report:
[1194842.307890] BTRFS critical (device loop1): corrupt leaf: root=5 block=27574272 slot=10 ino=258, invalid previous key objectid, have 257 expect 258
[1194842.322221] BTRFS info (device loop1): leaf 27574272 gen 8 total ptrs 11 free space 15444 owner 5
[1194842.331562] BTRFS info (device loop1): refs 2 lock_owner 0 current 26561
[1194842.338772] item 0 key (256 1 0) itemoff 16123 itemsize 160
[1194842.338793] inode generation 3 size 16 mode 40755
[1194842.338801] item 1 key (256 12 256) itemoff 16111 itemsize 12
[1194842.338809] item 2 key (256 84 2248503653) itemoff 16077 itemsize 34
[1194842.338817] dir oid 258 type 2
[1194842.338823] item 3 key (256 84 2363071922) itemoff 16043 itemsize 34
[1194842.338830] dir oid 257 type 2
[1194842.338836] item 4 key (256 96 2) itemoff 16009 itemsize 34
[1194842.338843] item 5 key (256 96 3) itemoff 15975 itemsize 34
[1194842.338852] item 6 key (257 1 0) itemoff 15815 itemsize 160
[1194842.338863] inode generation 6 size 8 mode 40755
[1194842.338869] item 7 key (257 12 256) itemoff 15801 itemsize 14
[1194842.338876] item 8 key (257 84 2505409169) itemoff 15767 itemsize 34
[1194842.338883] dir oid 256 type 2
[1194842.338888] item 9 key (257 96 2) itemoff 15733 itemsize 34
[1194842.338895] item 10 key (258 12 256) itemoff 15719 itemsize 14
[1194842.339163] BTRFS error (device loop1): block=27574272 write time tree block corruption detected
[1194842.339245] ------------[ cut here ]------------
[1194842.443422] WARNING: CPU: 6 PID: 26561 at fs/btrfs/disk-io.c:449 csum_one_extent_buffer+0xed/0x100 [btrfs]
[1194842.511863] CPU: 6 PID: 26561 Comm: kworker/u17:2 Not tainted 5.14.0-rc3-git+ #793
[1194842.511870] Hardware name: empty empty/S3993, BIOS PAQEX0-3 02/24/2008
[1194842.511876] Workqueue: btrfs-worker-high btrfs_work_helper [btrfs]
[1194842.511976] RIP: 0010:csum_one_extent_buffer+0xed/0x100 [btrfs]
[1194842.512068] RSP: 0018:ffffa2c284d77da0 EFLAGS: 00010282
[1194842.512074] RAX: 0000000000000000 RBX: 0000000000001000 RCX: ffff928867bd9978
[1194842.512078] RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff928867bd9970
[1194842.512081] RBP: ffff92876b958000 R08: 0000000000000001 R09: 00000000000c0003
[1194842.512085] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[1194842.512088] R13: ffff92875f989f98 R14: 0000000000000000 R15: 0000000000000000
[1194842.512092] FS: 0000000000000000(0000) GS:ffff928867a00000(0000) knlGS:0000000000000000
[1194842.512095] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1194842.512099] CR2: 000055f5384da1f0 CR3: 0000000102fe4000 CR4: 00000000000006e0
[1194842.512103] Call Trace:
[1194842.512128] ? run_one_async_free+0x10/0x10 [btrfs]
[1194842.631729] btree_csum_one_bio+0x1ac/0x1d0 [btrfs]
[1194842.631837] run_one_async_start+0x18/0x30 [btrfs]
[1194842.631938] btrfs_work_helper+0xd5/0x1d0 [btrfs]
[1194842.647482] process_one_work+0x262/0x5e0
[1194842.647520] worker_thread+0x4c/0x320
[1194842.655935] ? process_one_work+0x5e0/0x5e0
[1194842.655946] kthread+0x135/0x160
[1194842.655953] ? set_kthread_struct+0x40/0x40
[1194842.655965] ret_from_fork+0x1f/0x30
[1194842.672465] irq event stamp: 1729
[1194842.672469] hardirqs last enabled at (1735): [<ffffffffbd1104f5>] console_trylock_spinning+0x185/0x1a0
[1194842.672477] hardirqs last disabled at (1740): [<ffffffffbd1104cc>] console_trylock_spinning+0x15c/0x1a0
[1194842.672482] softirqs last enabled at (1666): [<ffffffffbdc002e1>] __do_softirq+0x2e1/0x50a
[1194842.672491] softirqs last disabled at (1651): [<ffffffffbd08aab7>] __irq_exit_rcu+0xa7/0xd0
The corrupted data will not be written, and filesystem can be unmounted
and mounted again (all changes since the last commit will be lost).
Add the missing check for new_ino so that all non-subvolumes must reside
under the same parent subvolume. There's an exception that allows
exchanging two subvolumes from any parents as the directory representing a
subvolume is only a logical link and does not have any other structures
related to the parent subvolume, unlike files, directories etc, that
are always in the inode namespace of the parent subvolume.
Fixes: cdd1fedf8261 ("btrfs: add support for RENAME_EXCHANGE and RENAME_WHITEOUT")
CC: stable@vger.kernel.org # 4.7+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-08-06 14:26:24 +10:00
|
|
|
/*
|
|
|
|
|
* For non-subvolumes allow exchange only within one subvolume, in the
|
|
|
|
|
* same inode namespace. Two subvolumes (represented as directory) can
|
|
|
|
|
* be exchanged as they're a logical link and have a fixed inode number.
|
|
|
|
|
*/
|
|
|
|
|
if (root != dest &&
|
|
|
|
|
(old_ino != BTRFS_FIRST_FREE_OBJECTID ||
|
|
|
|
|
new_ino != BTRFS_FIRST_FREE_OBJECTID))
|
2016-03-17 15:23:38 +01:00
|
|
|
return -EXDEV;
|
|
|
|
|
|
|
|
|
|
/* close the race window with snapshot create/destroy ioctl */
|
2019-11-19 13:59:20 -05:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
|
|
|
|
|
new_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 18:54:23 -04:00
|
|
|
down_read(&fs_info->subvol_sem);
|
2016-03-17 15:23:38 +01:00
|
|
|
|
|
|
|
|
/*
|
2022-03-09 17:31:32 -08:00
|
|
|
* For each inode:
|
|
|
|
|
* 1 to remove old dir item
|
|
|
|
|
* 1 to remove old dir index
|
|
|
|
|
* 1 to add new dir item
|
|
|
|
|
* 1 to add new dir index
|
|
|
|
|
* 1 to update parent inode
|
|
|
|
|
*
|
|
|
|
|
* If the parents are the same, we only need to account for one
|
2016-03-17 15:23:38 +01:00
|
|
|
*/
|
2022-03-09 17:31:32 -08:00
|
|
|
trans_num_items = (old_dir == new_dir ? 9 : 10);
|
|
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
|
/*
|
|
|
|
|
* 1 to remove old root ref
|
|
|
|
|
* 1 to remove old root backref
|
|
|
|
|
* 1 to add new root ref
|
|
|
|
|
* 1 to add new root backref
|
|
|
|
|
*/
|
|
|
|
|
trans_num_items += 4;
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* 1 to update inode item
|
|
|
|
|
* 1 to remove old inode ref
|
|
|
|
|
* 1 to add new inode ref
|
|
|
|
|
*/
|
|
|
|
|
trans_num_items += 3;
|
|
|
|
|
}
|
|
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
|
trans_num_items += 4;
|
|
|
|
|
else
|
|
|
|
|
trans_num_items += 3;
|
|
|
|
|
trans = btrfs_start_transaction(root, trans_num_items);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
|
goto out_notrans;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-12 15:25:02 -05:00
|
|
|
if (dest != root) {
|
|
|
|
|
ret = btrfs_record_root_in_trans(trans, dest);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
|
|
|
|
}
|
2019-11-15 15:43:06 -05:00
|
|
|
|
2016-03-17 15:23:38 +01:00
|
|
|
/*
|
|
|
|
|
* We need to find a free sequence number both in the source and
|
|
|
|
|
* in the destination directory for the exchange.
|
|
|
|
|
*/
|
2017-02-20 13:50:33 +02:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
2017-02-20 13:50:33 +02:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
|
|
|
|
|
|
|
|
|
BTRFS_I(old_inode)->dir_index = 0ULL;
|
|
|
|
|
BTRFS_I(new_inode)->dir_index = 0ULL;
|
|
|
|
|
|
|
|
|
|
/* Reference for the source. */
|
|
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
|
/* force full log commit if subvolume involved. */
|
2019-03-20 13:28:05 +01:00
|
|
|
btrfs_set_log_full_commit(trans);
|
2016-03-17 15:23:38 +01:00
|
|
|
} else {
|
|
|
|
|
ret = btrfs_insert_inode_ref(trans, dest,
|
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
|
new_dentry->d_name.len,
|
|
|
|
|
old_ino,
|
2017-01-20 14:54:07 +01:00
|
|
|
btrfs_ino(BTRFS_I(new_dir)),
|
|
|
|
|
old_idx);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
2021-05-19 14:04:21 -04:00
|
|
|
need_abort = true;
|
2016-03-17 15:23:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* And now for the dest. */
|
|
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
|
/* force full log commit if subvolume involved. */
|
2019-03-20 13:28:05 +01:00
|
|
|
btrfs_set_log_full_commit(trans);
|
2016-03-17 15:23:38 +01:00
|
|
|
} else {
|
|
|
|
|
ret = btrfs_insert_inode_ref(trans, root,
|
|
|
|
|
old_dentry->d_name.name,
|
|
|
|
|
old_dentry->d_name.len,
|
|
|
|
|
new_ino,
|
2017-01-20 14:54:07 +01:00
|
|
|
btrfs_ino(BTRFS_I(old_dir)),
|
|
|
|
|
new_idx);
|
2021-05-19 14:04:21 -04:00
|
|
|
if (ret) {
|
|
|
|
|
if (need_abort)
|
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 15:23:38 +01:00
|
|
|
goto out_fail;
|
2021-05-19 14:04:21 -04:00
|
|
|
}
|
2016-03-17 15:23:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Update inode version and ctime/mtime. */
|
|
|
|
|
inode_inc_iversion(old_dir);
|
|
|
|
|
inode_inc_iversion(new_dir);
|
|
|
|
|
inode_inc_iversion(old_inode);
|
|
|
|
|
inode_inc_iversion(new_inode);
|
2022-06-21 18:40:48 +02:00
|
|
|
old_dir->i_mtime = ctime;
|
|
|
|
|
old_dir->i_ctime = ctime;
|
|
|
|
|
new_dir->i_mtime = ctime;
|
|
|
|
|
new_dir->i_ctime = ctime;
|
2016-03-17 15:23:38 +01:00
|
|
|
old_inode->i_ctime = ctime;
|
|
|
|
|
new_inode->i_ctime = ctime;
|
|
|
|
|
|
|
|
|
|
if (old_dentry->d_parent != new_dentry->d_parent) {
|
2017-01-20 14:54:07 +01:00
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
|
|
|
|
|
BTRFS_I(old_inode), 1);
|
|
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
|
|
|
|
|
BTRFS_I(new_inode), 1);
|
2016-03-17 15:23:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* src is a subvolume */
|
|
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
2019-12-18 17:20:27 -05:00
|
|
|
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
|
2016-03-17 15:23:38 +01:00
|
|
|
} else { /* src is an inode */
|
2021-10-25 17:31:50 +01:00
|
|
|
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
|
2017-01-18 00:31:44 +02:00
|
|
|
BTRFS_I(old_dentry->d_inode),
|
2016-03-17 15:23:38 +01:00
|
|
|
old_dentry->d_name.name,
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
old_dentry->d_name.len,
|
|
|
|
|
&old_rename_ctx);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (!ret)
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
|
2016-03-17 15:23:38 +01:00
|
|
|
}
|
|
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 15:23:38 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* dest is a subvolume */
|
|
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
2019-12-18 17:20:27 -05:00
|
|
|
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
|
2016-03-17 15:23:38 +01:00
|
|
|
} else { /* dest is an inode */
|
2021-10-25 17:31:50 +01:00
|
|
|
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
|
2017-01-18 00:31:44 +02:00
|
|
|
BTRFS_I(new_dentry->d_inode),
|
2016-03-17 15:23:38 +01:00
|
|
|
new_dentry->d_name.name,
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
new_dentry->d_name.len,
|
|
|
|
|
&new_rename_ctx);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (!ret)
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
|
2016-03-17 15:23:38 +01:00
|
|
|
}
|
|
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 15:23:38 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-20 13:51:08 +02:00
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
|
2016-03-17 15:23:38 +01:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
|
new_dentry->d_name.len, 0, old_idx);
|
|
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 15:23:38 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-20 13:51:08 +02:00
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
|
2016-03-17 15:23:38 +01:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
|
old_dentry->d_name.len, 0, new_idx);
|
|
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 15:23:38 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (old_inode->i_nlink == 1)
|
|
|
|
|
BTRFS_I(old_inode)->dir_index = old_idx;
|
|
|
|
|
if (new_inode->i_nlink == 1)
|
|
|
|
|
BTRFS_I(new_inode)->dir_index = new_idx;
|
|
|
|
|
|
btrfs: stop doing unnecessary log updates during a rename
During a rename, we call __btrfs_unlink_inode(), which will call
btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(), in order
to remove an inode reference and a directory entry from the log. These
are necessary when __btrfs_unlink_inode() is called from the unlink path,
but not necessary when it's called from a rename context, because:
1) For the btrfs_del_inode_ref_in_log() call, it's pointless to delete the
inode reference related to the old name, because later in the rename
path we call btrfs_log_new_name(), which will drop all inode references
from the log and copy all inode references from the subvolume tree to
the log tree. So we are doing one unnecessary btree operation which
adds additional latency and lock contention in case there are other
tasks accessing the log tree;
2) For the btrfs_del_dir_entries_in_log() call, we are now doing the
equivalent at btrfs_log_new_name() since the previous patch in the
series, that has the subject "btrfs: avoid logging all directory
changes during renames". In fact, having __btrfs_unlink_inode() call
this function not only adds additional latency and lock contention due
to the extra btree operation, but also can make btrfs_log_new_name()
unnecessarily log a range item to track the deletion of the old name,
since it has no way to know that the directory entry related to the
old name was previously logged and already deleted by
__btrfs_unlink_inode() through its call to
btrfs_del_dir_entries_in_log().
So skip those calls at __btrfs_unlink_inode() when we are doing a rename.
Skipping them also allows us now to reduce the duration of time we are
pinning a log transaction during renames, which is always beneficial as
it's not delaying so much other tasks trying to sync the log tree, in
particular we end up not holding the log transaction pinned while adding
the new name (adding inode ref, directory entry, etc).
This change is part of a patchset comprised of the following patches:
1/5 btrfs: add helper to delete a dir entry from a log tree
2/5 btrfs: pass the dentry to btrfs_log_new_name() instead of the inode
3/5 btrfs: avoid logging all directory changes during renames
4/5 btrfs: stop doing unnecessary log updates during a rename
5/5 btrfs: avoid inode logging during rename and link when possible
Just like the previous patch in the series, "btrfs: avoid logging all
directory changes during renames", the following script mimics part of
what a package installation/upgrade with zypper does, which is basically
renaming a lot of files, in some directory under /usr, to a name with a
suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before patchset: 27399 ms
NUM_FILES=10000, after patches 1/5 to 3/5 applied: 9093 ms (-66.8%)
NUM_FILES=10000, after patches 1/5 to 4/5 applied: 9016 ms (-67.1%)
NUM_FILES=5000, before patchset: 9241 ms
NUM_FILES=5000, after patches 1/5 to 3/5 applied: 4642 ms (-49.8%)
NUM_FILES=5000, after patches 1/5 to 4/5 applied: 4553 ms (-50.7%)
NUM_FILES=2000, before patchset: 2550 ms
NUM_FILES=2000, after patches 1/5 to 3/5 applied: 1788 ms (-29.9%)
NUM_FILES=2000, after patches 1/5 to 4/5 applied: 1767 ms (-30.7%)
NUM_FILES=1000, before patchset: 1088 ms
NUM_FILES=1000, after patches 1/5 to 3/5 applied: 905 ms (-16.9%)
NUM_FILES=1000, after patches 1/5 to 4/5 applied: 883 ms (-18.8%)
The next patch in the series (5/5) also contains dbench results after
applying the whole patchset.
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:09 +00:00
|
|
|
/*
|
|
|
|
|
* Now pin the logs of the roots. We do it to ensure that no other task
|
|
|
|
|
* can sync the logs while we are in progress with the rename, because
|
|
|
|
|
* that could result in an inconsistency in case any of the inodes that
|
|
|
|
|
* are part of this rename operation were logged before.
|
|
|
|
|
*/
|
|
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
|
btrfs_pin_log_trans(root);
|
|
|
|
|
if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
|
btrfs_pin_log_trans(dest);
|
|
|
|
|
|
|
|
|
|
/* Do the log updates for all inodes. */
|
|
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
|
2022-01-20 11:00:07 +00:00
|
|
|
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
old_rename_ctx.index, new_dentry->d_parent);
|
btrfs: stop doing unnecessary log updates during a rename
During a rename, we call __btrfs_unlink_inode(), which will call
btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(), in order
to remove an inode reference and a directory entry from the log. These
are necessary when __btrfs_unlink_inode() is called from the unlink path,
but not necessary when it's called from a rename context, because:
1) For the btrfs_del_inode_ref_in_log() call, it's pointless to delete the
inode reference related to the old name, because later in the rename
path we call btrfs_log_new_name(), which will drop all inode references
from the log and copy all inode references from the subvolume tree to
the log tree. So we are doing one unnecessary btree operation which
adds additional latency and lock contention in case there are other
tasks accessing the log tree;
2) For the btrfs_del_dir_entries_in_log() call, we are now doing the
equivalent at btrfs_log_new_name() since the previous patch in the
series, that has the subject "btrfs: avoid logging all directory
changes during renames". In fact, having __btrfs_unlink_inode() call
this function not only adds additional latency and lock contention due
to the extra btree operation, but also can make btrfs_log_new_name()
unnecessarily log a range item to track the deletion of the old name,
since it has no way to know that the directory entry related to the
old name was previously logged and already deleted by
__btrfs_unlink_inode() through its call to
btrfs_del_dir_entries_in_log().
So skip those calls at __btrfs_unlink_inode() when we are doing a rename.
Skipping them also allows us now to reduce the duration of time we are
pinning a log transaction during renames, which is always beneficial as
it's not delaying so much other tasks trying to sync the log tree, in
particular we end up not holding the log transaction pinned while adding
the new name (adding inode ref, directory entry, etc).
This change is part of a patchset comprised of the following patches:
1/5 btrfs: add helper to delete a dir entry from a log tree
2/5 btrfs: pass the dentry to btrfs_log_new_name() instead of the inode
3/5 btrfs: avoid logging all directory changes during renames
4/5 btrfs: stop doing unnecessary log updates during a rename
5/5 btrfs: avoid inode logging during rename and link when possible
Just like the previous patch in the series, "btrfs: avoid logging all
directory changes during renames", the following script mimics part of
what a package installation/upgrade with zypper does, which is basically
renaming a lot of files, in some directory under /usr, to a name with a
suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before patchset: 27399 ms
NUM_FILES=10000, after patches 1/5 to 3/5 applied: 9093 ms (-66.8%)
NUM_FILES=10000, after patches 1/5 to 4/5 applied: 9016 ms (-67.1%)
NUM_FILES=5000, before patchset: 9241 ms
NUM_FILES=5000, after patches 1/5 to 3/5 applied: 4642 ms (-49.8%)
NUM_FILES=5000, after patches 1/5 to 4/5 applied: 4553 ms (-50.7%)
NUM_FILES=2000, before patchset: 2550 ms
NUM_FILES=2000, after patches 1/5 to 3/5 applied: 1788 ms (-29.9%)
NUM_FILES=2000, after patches 1/5 to 4/5 applied: 1767 ms (-30.7%)
NUM_FILES=1000, before patchset: 1088 ms
NUM_FILES=1000, after patches 1/5 to 3/5 applied: 905 ms (-16.9%)
NUM_FILES=1000, after patches 1/5 to 4/5 applied: 883 ms (-18.8%)
The next patch in the series (5/5) also contains dbench results after
applying the whole patchset.
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:09 +00:00
|
|
|
if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
|
2022-01-20 11:00:07 +00:00
|
|
|
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
new_rename_ctx.index, old_dentry->d_parent);
|
btrfs: stop doing unnecessary log updates during a rename
During a rename, we call __btrfs_unlink_inode(), which will call
btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(), in order
to remove an inode reference and a directory entry from the log. These
are necessary when __btrfs_unlink_inode() is called from the unlink path,
but not necessary when it's called from a rename context, because:
1) For the btrfs_del_inode_ref_in_log() call, it's pointless to delete the
inode reference related to the old name, because later in the rename
path we call btrfs_log_new_name(), which will drop all inode references
from the log and copy all inode references from the subvolume tree to
the log tree. So we are doing one unnecessary btree operation which
adds additional latency and lock contention in case there are other
tasks accessing the log tree;
2) For the btrfs_del_dir_entries_in_log() call, we are now doing the
equivalent at btrfs_log_new_name() since the previous patch in the
series, that has the subject "btrfs: avoid logging all directory
changes during renames". In fact, having __btrfs_unlink_inode() call
this function not only adds additional latency and lock contention due
to the extra btree operation, but also can make btrfs_log_new_name()
unnecessarily log a range item to track the deletion of the old name,
since it has no way to know that the directory entry related to the
old name was previously logged and already deleted by
__btrfs_unlink_inode() through its call to
btrfs_del_dir_entries_in_log().
So skip those calls at __btrfs_unlink_inode() when we are doing a rename.
Skipping them also allows us now to reduce the duration of time we are
pinning a log transaction during renames, which is always beneficial as
it's not delaying so much other tasks trying to sync the log tree, in
particular we end up not holding the log transaction pinned while adding
the new name (adding inode ref, directory entry, etc).
This change is part of a patchset comprised of the following patches:
1/5 btrfs: add helper to delete a dir entry from a log tree
2/5 btrfs: pass the dentry to btrfs_log_new_name() instead of the inode
3/5 btrfs: avoid logging all directory changes during renames
4/5 btrfs: stop doing unnecessary log updates during a rename
5/5 btrfs: avoid inode logging during rename and link when possible
Just like the previous patch in the series, "btrfs: avoid logging all
directory changes during renames", the following script mimics part of
what a package installation/upgrade with zypper does, which is basically
renaming a lot of files, in some directory under /usr, to a name with a
suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before patchset: 27399 ms
NUM_FILES=10000, after patches 1/5 to 3/5 applied: 9093 ms (-66.8%)
NUM_FILES=10000, after patches 1/5 to 4/5 applied: 9016 ms (-67.1%)
NUM_FILES=5000, before patchset: 9241 ms
NUM_FILES=5000, after patches 1/5 to 3/5 applied: 4642 ms (-49.8%)
NUM_FILES=5000, after patches 1/5 to 4/5 applied: 4553 ms (-50.7%)
NUM_FILES=2000, before patchset: 2550 ms
NUM_FILES=2000, after patches 1/5 to 3/5 applied: 1788 ms (-29.9%)
NUM_FILES=2000, after patches 1/5 to 4/5 applied: 1767 ms (-30.7%)
NUM_FILES=1000, before patchset: 1088 ms
NUM_FILES=1000, after patches 1/5 to 3/5 applied: 905 ms (-16.9%)
NUM_FILES=1000, after patches 1/5 to 4/5 applied: 883 ms (-18.8%)
The next patch in the series (5/5) also contains dbench results after
applying the whole patchset.
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:09 +00:00
|
|
|
|
|
|
|
|
/* Now unpin the logs. */
|
|
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
|
btrfs_end_log_trans(root);
|
|
|
|
|
if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
|
2016-03-17 15:23:38 +01:00
|
|
|
btrfs_end_log_trans(dest);
|
|
|
|
|
out_fail:
|
btrfs: do not commit logs and transactions during link and rename operations
Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we
started to commit logs, and fallback to transaction commits when we failed
to log the new names or commit the logs, after link and rename operations
when the target inodes (or their parents) were previously logged in the
current transaction. This was to avoid losing directories despite an
explicit fsync on them when they are ancestors of some inode that got a
new name logged, due to a link or rename operation. However that adds the
cost of starting IO and waiting for it to complete, which can cause higher
latencies for applications.
Instead of doing that, just make sure that when we log a new name for an
inode we don't mark any of its ancestors as logged, so that if any one
does an fsync against any of them, without doing any other change on them,
the fsync commits the log. This way we only pay the cost of a log commit
(or a transaction commit if something goes wrong or a new block group was
created) if the application explicitly asks to fsync any of the parent
directories.
Using dbench, which mixes several filesystem operations including renames,
revealed some significant latency gains. The following script that uses
dbench was used to test this:
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/btrfs
MOUNT_OPTIONS="-o ssd -o space_cache=v2"
MKFS_OPTIONS="-m single -d single"
THREADS=16
echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -t 300 -D $MNT $THREADS
umount $MNT
The test was run on bare metal, no virtualization, on a box with 12 cores
(Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel
configuration that is the default of typical distributions (debian in this
case), without debug options enabled (kasan, kmemleak, slub debug, debug
of page allocations, lock debugging, etc).
Results before this patch:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 10750455 0.011 155.088
Close 7896674 0.001 0.243
Rename 455222 2.158 1101.947
Unlink 2171189 0.067 121.638
Deltree 256 2.425 7.816
Mkdir 128 0.002 0.003
Qpathinfo 9744323 0.006 21.370
Qfileinfo 1707092 0.001 0.146
Qfsinfo 1786756 0.001 11.228
Sfileinfo 875612 0.003 21.263
Find 3767281 0.025 9.617
WriteX 5356924 0.011 211.390
ReadX 16852694 0.003 9.442
LockX 35008 0.002 0.119
UnlockX 35008 0.001 0.138
Flush 753458 4.252 1102.249
Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms
Results after this patch:
16 clients, after
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 11471098 0.012 448.281
Close 8426396 0.001 0.925
Rename 485746 0.123 267.183
Unlink 2316477 0.080 63.433
Deltree 288 2.830 11.144
Mkdir 144 0.003 0.010
Qpathinfo 10397420 0.006 10.288
Qfileinfo 1822039 0.001 0.169
Qfsinfo 1906497 0.002 14.039
Sfileinfo 934433 0.004 2.438
Find 4019879 0.026 10.200
WriteX 5718932 0.011 200.985
ReadX 17981671 0.003 10.036
LockX 37352 0.002 0.076
UnlockX 37352 0.001 0.109
Flush 804018 5.015 778.033
Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms
(+6.5% throughput, -29.4% max latency, -75.8% rename latency)
Test case generic/498 from fstests tests the scenario that the previously
mentioned commit fixed.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-11 12:43:48 +01:00
|
|
|
ret2 = btrfs_end_transaction(trans);
|
|
|
|
|
ret = ret ? ret : ret2;
|
2016-03-17 15:23:38 +01:00
|
|
|
out_notrans:
|
2019-11-19 13:59:20 -05:00
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
|
|
|
|
|
old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 18:54:23 -04:00
|
|
|
up_read(&fs_info->subvol_sem);
|
2016-03-17 15:23:38 +01:00
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-14 18:12:32 -07:00
|
|
|
static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
|
|
|
|
|
struct inode *dir)
|
|
|
|
|
{
|
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
|
|
inode = new_inode(dir->i_sb);
|
|
|
|
|
if (inode) {
|
|
|
|
|
inode_init_owner(mnt_userns, inode, dir,
|
|
|
|
|
S_IFCHR | WHITEOUT_MODE);
|
|
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
|
|
|
|
init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
|
|
|
|
|
}
|
|
|
|
|
return inode;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-27 12:48:42 +02:00
|
|
|
static int btrfs_rename(struct user_namespace *mnt_userns,
|
|
|
|
|
struct inode *old_dir, struct dentry *old_dentry,
|
|
|
|
|
struct inode *new_dir, struct dentry *new_dentry,
|
|
|
|
|
unsigned int flags)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
|
2022-03-14 18:12:34 -07:00
|
|
|
struct btrfs_new_inode_args whiteout_args = {
|
|
|
|
|
.dir = old_dir,
|
|
|
|
|
.dentry = old_dentry,
|
|
|
|
|
};
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
2016-05-05 10:26:26 +01:00
|
|
|
unsigned int trans_num_items;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(old_dir)->root;
|
2009-09-21 15:56:00 -04:00
|
|
|
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *new_inode = d_inode(new_dentry);
|
|
|
|
|
struct inode *old_inode = d_inode(old_dentry);
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
struct btrfs_rename_ctx rename_ctx;
|
2008-08-05 11:18:09 -04:00
|
|
|
u64 index = 0;
|
2007-06-12 06:35:45 -04:00
|
|
|
int ret;
|
btrfs: do not commit logs and transactions during link and rename operations
Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we
started to commit logs, and fallback to transaction commits when we failed
to log the new names or commit the logs, after link and rename operations
when the target inodes (or their parents) were previously logged in the
current transaction. This was to avoid losing directories despite an
explicit fsync on them when they are ancestors of some inode that got a
new name logged, due to a link or rename operation. However that adds the
cost of starting IO and waiting for it to complete, which can cause higher
latencies for applications.
Instead of doing that, just make sure that when we log a new name for an
inode we don't mark any of its ancestors as logged, so that if any one
does an fsync against any of them, without doing any other change on them,
the fsync commits the log. This way we only pay the cost of a log commit
(or a transaction commit if something goes wrong or a new block group was
created) if the application explicitly asks to fsync any of the parent
directories.
Using dbench, which mixes several filesystem operations including renames,
revealed some significant latency gains. The following script that uses
dbench was used to test this:
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/btrfs
MOUNT_OPTIONS="-o ssd -o space_cache=v2"
MKFS_OPTIONS="-m single -d single"
THREADS=16
echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -t 300 -D $MNT $THREADS
umount $MNT
The test was run on bare metal, no virtualization, on a box with 12 cores
(Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel
configuration that is the default of typical distributions (debian in this
case), without debug options enabled (kasan, kmemleak, slub debug, debug
of page allocations, lock debugging, etc).
Results before this patch:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 10750455 0.011 155.088
Close 7896674 0.001 0.243
Rename 455222 2.158 1101.947
Unlink 2171189 0.067 121.638
Deltree 256 2.425 7.816
Mkdir 128 0.002 0.003
Qpathinfo 9744323 0.006 21.370
Qfileinfo 1707092 0.001 0.146
Qfsinfo 1786756 0.001 11.228
Sfileinfo 875612 0.003 21.263
Find 3767281 0.025 9.617
WriteX 5356924 0.011 211.390
ReadX 16852694 0.003 9.442
LockX 35008 0.002 0.119
UnlockX 35008 0.001 0.138
Flush 753458 4.252 1102.249
Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms
Results after this patch:
16 clients, after
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 11471098 0.012 448.281
Close 8426396 0.001 0.925
Rename 485746 0.123 267.183
Unlink 2316477 0.080 63.433
Deltree 288 2.830 11.144
Mkdir 144 0.003 0.010
Qpathinfo 10397420 0.006 10.288
Qfileinfo 1822039 0.001 0.169
Qfsinfo 1906497 0.002 14.039
Sfileinfo 934433 0.004 2.438
Find 4019879 0.026 10.200
WriteX 5718932 0.011 200.985
ReadX 17981671 0.003 10.036
LockX 37352 0.002 0.076
UnlockX 37352 0.001 0.109
Flush 804018 5.015 778.033
Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms
(+6.5% throughput, -29.4% max latency, -75.8% rename latency)
Test case generic/498 from fstests tests the scenario that the previously
mentioned commit fixed.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-11 12:43:48 +01:00
|
|
|
int ret2;
|
2017-01-10 20:35:31 +02:00
|
|
|
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2017-01-10 20:35:31 +02:00
|
|
|
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2009-09-24 09:17:31 -04:00
|
|
|
return -EPERM;
|
|
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
/* we only allow rename subvolume link between subvolumes */
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
|
2008-11-17 20:42:26 -05:00
|
|
|
return -EXDEV;
|
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
|
2017-01-10 20:35:31 +02:00
|
|
|
(new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
|
2007-06-12 06:35:45 -04:00
|
|
|
return -ENOTEMPTY;
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2009-09-21 15:56:00 -04:00
|
|
|
if (S_ISDIR(old_inode->i_mode) && new_inode &&
|
|
|
|
|
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
|
|
|
|
return -ENOTEMPTY;
|
2012-12-17 14:26:57 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
/* check for collisions, even if the name isn't there */
|
2013-10-09 12:24:04 -04:00
|
|
|
ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
|
2012-12-17 14:26:57 -05:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
|
if (ret == -EEXIST) {
|
|
|
|
|
/* we shouldn't get
|
|
|
|
|
* eexist without a new_inode */
|
2013-10-31 10:30:08 +05:30
|
|
|
if (WARN_ON(!new_inode)) {
|
2012-12-17 14:26:57 -05:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/* maybe -EOVERFLOW */
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
2009-03-31 13:27:11 -04:00
|
|
|
/*
|
2014-08-12 10:47:42 -07:00
|
|
|
* we're using rename to replace one file with another. Start IO on it
|
|
|
|
|
* now so we don't add too much work to the end of the transaction
|
2009-03-31 13:27:11 -04:00
|
|
|
*/
|
2014-08-12 10:47:42 -07:00
|
|
|
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
|
2009-03-31 13:27:11 -04:00
|
|
|
filemap_flush(old_inode->i_mapping);
|
|
|
|
|
|
2022-03-14 18:12:32 -07:00
|
|
|
if (flags & RENAME_WHITEOUT) {
|
2022-03-14 18:12:34 -07:00
|
|
|
whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
|
|
|
|
|
if (!whiteout_args.inode)
|
2022-03-14 18:12:32 -07:00
|
|
|
return -ENOMEM;
|
2022-03-14 18:12:34 -07:00
|
|
|
ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_whiteout_inode;
|
|
|
|
|
} else {
|
|
|
|
|
/* 1 to update the old parent inode. */
|
|
|
|
|
trans_num_items = 1;
|
2022-03-14 18:12:32 -07:00
|
|
|
}
|
|
|
|
|
|
2022-03-09 17:31:32 -08:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
|
/* Close the race window with snapshot create/destroy ioctl */
|
2016-06-22 18:54:23 -04:00
|
|
|
down_read(&fs_info->subvol_sem);
|
2022-03-09 17:31:32 -08:00
|
|
|
/*
|
|
|
|
|
* 1 to remove old root ref
|
|
|
|
|
* 1 to remove old root backref
|
|
|
|
|
* 1 to add new root ref
|
|
|
|
|
* 1 to add new root backref
|
|
|
|
|
*/
|
2022-03-14 18:12:34 -07:00
|
|
|
trans_num_items += 4;
|
2022-03-09 17:31:32 -08:00
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* 1 to update inode
|
|
|
|
|
* 1 to remove old inode ref
|
|
|
|
|
* 1 to add new inode ref
|
|
|
|
|
*/
|
2022-03-14 18:12:34 -07:00
|
|
|
trans_num_items += 3;
|
2022-03-09 17:31:32 -08:00
|
|
|
}
|
2010-05-16 10:48:46 -04:00
|
|
|
/*
|
2022-03-09 17:31:32 -08:00
|
|
|
* 1 to remove old dir item
|
|
|
|
|
* 1 to remove old dir index
|
|
|
|
|
* 1 to add new dir item
|
|
|
|
|
* 1 to add new dir index
|
2010-05-16 10:48:46 -04:00
|
|
|
*/
|
2022-03-14 18:12:34 -07:00
|
|
|
trans_num_items += 4;
|
|
|
|
|
/* 1 to update new parent inode if it's not the same as the old parent */
|
2022-03-09 17:31:32 -08:00
|
|
|
if (new_dir != old_dir)
|
|
|
|
|
trans_num_items++;
|
|
|
|
|
if (new_inode) {
|
|
|
|
|
/*
|
|
|
|
|
* 1 to update inode
|
|
|
|
|
* 1 to remove inode ref
|
|
|
|
|
* 1 to remove dir item
|
|
|
|
|
* 1 to remove dir index
|
|
|
|
|
* 1 to possibly add orphan item
|
|
|
|
|
*/
|
|
|
|
|
trans_num_items += 5;
|
|
|
|
|
}
|
2016-05-05 10:26:26 +01:00
|
|
|
trans = btrfs_start_transaction(root, trans_num_items);
|
2011-03-31 13:23:47 +00:00
|
|
|
if (IS_ERR(trans)) {
|
2016-03-17 15:23:38 +01:00
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
|
goto out_notrans;
|
|
|
|
|
}
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2021-03-12 15:25:03 -05:00
|
|
|
if (dest != root) {
|
|
|
|
|
ret = btrfs_record_root_in_trans(trans, dest);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
|
|
|
|
}
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2017-02-20 13:50:33 +02:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
|
2009-09-24 09:17:31 -04:00
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
2009-03-31 13:27:11 -04:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
BTRFS_I(old_inode)->dir_index = 0ULL;
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 15:56:00 -04:00
|
|
|
/* force full log commit if subvolume involved. */
|
2019-03-20 13:28:05 +01:00
|
|
|
btrfs_set_log_full_commit(trans);
|
2009-09-21 15:56:00 -04:00
|
|
|
} else {
|
2009-09-24 09:17:31 -04:00
|
|
|
ret = btrfs_insert_inode_ref(trans, dest,
|
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
|
new_dentry->d_name.len,
|
2011-04-20 10:31:50 +08:00
|
|
|
old_ino,
|
2017-01-10 20:35:31 +02:00
|
|
|
btrfs_ino(BTRFS_I(new_dir)), index);
|
2009-09-24 09:17:31 -04:00
|
|
|
if (ret)
|
|
|
|
|
goto out_fail;
|
2009-09-21 15:56:00 -04:00
|
|
|
}
|
2009-03-31 13:27:11 -04:00
|
|
|
|
2012-04-05 15:03:02 -04:00
|
|
|
inode_inc_iversion(old_dir);
|
|
|
|
|
inode_inc_iversion(new_dir);
|
|
|
|
|
inode_inc_iversion(old_inode);
|
2022-06-21 18:40:48 +02:00
|
|
|
old_dir->i_mtime = current_time(old_dir);
|
|
|
|
|
old_dir->i_ctime = old_dir->i_mtime;
|
|
|
|
|
new_dir->i_mtime = old_dir->i_mtime;
|
|
|
|
|
new_dir->i_ctime = old_dir->i_mtime;
|
|
|
|
|
old_inode->i_ctime = old_dir->i_mtime;
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2009-03-24 10:24:20 -04:00
|
|
|
if (old_dentry->d_parent != new_dentry->d_parent)
|
2017-01-20 14:54:07 +01:00
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
|
|
|
|
|
BTRFS_I(old_inode), 1);
|
2009-03-24 10:24:20 -04:00
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2019-12-18 17:20:27 -05:00
|
|
|
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
|
2009-09-21 15:56:00 -04:00
|
|
|
} else {
|
2021-10-25 17:31:50 +01:00
|
|
|
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
|
2017-01-18 00:31:44 +02:00
|
|
|
BTRFS_I(d_inode(old_dentry)),
|
2011-03-04 17:14:37 +00:00
|
|
|
old_dentry->d_name.name,
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
old_dentry->d_name.len,
|
|
|
|
|
&rename_ctx);
|
2011-03-04 17:14:37 +00:00
|
|
|
if (!ret)
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
|
2009-09-21 15:56:00 -04:00
|
|
|
}
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
|
|
|
|
if (new_inode) {
|
2012-04-05 15:03:02 -04:00
|
|
|
inode_inc_iversion(new_inode);
|
2016-09-14 07:48:06 -07:00
|
|
|
new_inode->i_ctime = current_time(new_inode);
|
2017-01-10 20:35:31 +02:00
|
|
|
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
|
2009-09-21 15:56:00 -04:00
|
|
|
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
2019-12-18 17:20:27 -05:00
|
|
|
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
|
2009-09-21 15:56:00 -04:00
|
|
|
BUG_ON(new_inode->i_nlink == 0);
|
|
|
|
|
} else {
|
2021-10-25 17:31:50 +01:00
|
|
|
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
|
2017-01-18 00:31:44 +02:00
|
|
|
BTRFS_I(d_inode(new_dentry)),
|
2009-09-21 15:56:00 -04:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
|
}
|
2013-08-13 14:10:08 -04:00
|
|
|
if (!ret && new_inode->i_nlink == 0)
|
2017-02-20 13:50:59 +02:00
|
|
|
ret = btrfs_orphan_add(trans,
|
|
|
|
|
BTRFS_I(d_inode(new_dentry)));
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
}
|
2008-07-24 12:12:38 -04:00
|
|
|
|
2017-02-20 13:51:08 +02:00
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
|
2009-09-21 15:56:00 -04:00
|
|
|
new_dentry->d_name.name,
|
2009-09-24 09:17:31 -04:00
|
|
|
new_dentry->d_name.len, 0, index);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
goto out_fail;
|
|
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
if (old_inode->i_nlink == 1)
|
|
|
|
|
BTRFS_I(old_inode)->dir_index = index;
|
|
|
|
|
|
btrfs: stop doing unnecessary log updates during a rename
During a rename, we call __btrfs_unlink_inode(), which will call
btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(), in order
to remove an inode reference and a directory entry from the log. These
are necessary when __btrfs_unlink_inode() is called from the unlink path,
but not necessary when it's called from a rename context, because:
1) For the btrfs_del_inode_ref_in_log() call, it's pointless to delete the
inode reference related to the old name, because later in the rename
path we call btrfs_log_new_name(), which will drop all inode references
from the log and copy all inode references from the subvolume tree to
the log tree. So we are doing one unnecessary btree operation which
adds additional latency and lock contention in case there are other
tasks accessing the log tree;
2) For the btrfs_del_dir_entries_in_log() call, we are now doing the
equivalent at btrfs_log_new_name() since the previous patch in the
series, that has the subject "btrfs: avoid logging all directory
changes during renames". In fact, having __btrfs_unlink_inode() call
this function not only adds additional latency and lock contention due
to the extra btree operation, but also can make btrfs_log_new_name()
unnecessarily log a range item to track the deletion of the old name,
since it has no way to know that the directory entry related to the
old name was previously logged and already deleted by
__btrfs_unlink_inode() through its call to
btrfs_del_dir_entries_in_log().
So skip those calls at __btrfs_unlink_inode() when we are doing a rename.
Skipping them also allows us now to reduce the duration of time we are
pinning a log transaction during renames, which is always beneficial as
it's not delaying so much other tasks trying to sync the log tree, in
particular we end up not holding the log transaction pinned while adding
the new name (adding inode ref, directory entry, etc).
This change is part of a patchset comprised of the following patches:
1/5 btrfs: add helper to delete a dir entry from a log tree
2/5 btrfs: pass the dentry to btrfs_log_new_name() instead of the inode
3/5 btrfs: avoid logging all directory changes during renames
4/5 btrfs: stop doing unnecessary log updates during a rename
5/5 btrfs: avoid inode logging during rename and link when possible
Just like the previous patch in the series, "btrfs: avoid logging all
directory changes during renames", the following script mimics part of
what a package installation/upgrade with zypper does, which is basically
renaming a lot of files, in some directory under /usr, to a name with a
suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before patchset: 27399 ms
NUM_FILES=10000, after patches 1/5 to 3/5 applied: 9093 ms (-66.8%)
NUM_FILES=10000, after patches 1/5 to 4/5 applied: 9016 ms (-67.1%)
NUM_FILES=5000, before patchset: 9241 ms
NUM_FILES=5000, after patches 1/5 to 3/5 applied: 4642 ms (-49.8%)
NUM_FILES=5000, after patches 1/5 to 4/5 applied: 4553 ms (-50.7%)
NUM_FILES=2000, before patchset: 2550 ms
NUM_FILES=2000, after patches 1/5 to 3/5 applied: 1788 ms (-29.9%)
NUM_FILES=2000, after patches 1/5 to 4/5 applied: 1767 ms (-30.7%)
NUM_FILES=1000, before patchset: 1088 ms
NUM_FILES=1000, after patches 1/5 to 3/5 applied: 905 ms (-16.9%)
NUM_FILES=1000, after patches 1/5 to 4/5 applied: 883 ms (-18.8%)
The next patch in the series (5/5) also contains dbench results after
applying the whole patchset.
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:09 +00:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
|
2022-01-20 11:00:07 +00:00
|
|
|
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
|
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory
were logged before, we log the new name of the file and then make sure
we log the old parent directory, to ensure that after a log replay the
old name of the file is deleted and the new name added.
The logging of the old parent directory can take some time, because it
will scan all leaves modified in the current transaction, check which
directory entries were already logged, copy the ones that were not
logged before, etc. In this rename context all we need to do is make
sure that the old name of the file is deleted on log replay, so instead
of triggering a directory log operation, we can just delete the old
directory entry from the log if it's there, or in case it isn't there,
just log a range item to signal log replay that the old name must be
deleted. So change btrfs_log_new_name() to do that.
This scenario is actually not uncommon to trigger, and recently on a
5.15 kernel, an openSUSE Tumbleweed user reported package installations
and upgrades, with the zypper tool, were often taking a long time to
complete, much more than usual. With strace it could be observed that
zypper was spending over 99% of its time on rename operations, and then
with further analysis we checked that directory logging was happening
too frequently and causing high latencies for the rename operations.
Taking into account that installation/upgrade of some of these packages
needed about a few thousand file renames, the slowdown was very noticeable
for the user.
The issue was caused indirectly due to an excessive number of inode
evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14
or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure,
in an efficient way, if an inode was previously logged in the current
transaction, so we are pessimistic and assume it was, because in case
it was we need to update the logged inode. More details on that in one
of the patches in the same series (subject "btrfs: avoid inode logging
during rename and link when possible"). Either way, in case the parent
directory was logged before, we currently do more work than necessary
during a rename, and this change minimizes that amount of work.
The following script mimics part of what a package installation/upgrade
with zypper does, which is basically renaming a lot of files, in some
directory under /usr, to a name with a suffix of "-RPMDELETE":
$ cat test.sh
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/nvme0n1
NUM_FILES=10000
mkfs.btrfs -f $DEV
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= $NUM_FILES; i++)); do
echo -n > $MNT/testdir/file_$i
done
sync
# Do some change to testdir and fsync it.
echo -n > $MNT/testdir/file_$((NUM_FILES + 1))
xfs_io -c "fsync" $MNT/testdir
echo "Renaming $NUM_FILES files..."
start=$(date +%s%N)
for ((i = 1; i <= $NUM_FILES; i++)); do
mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE
done
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "Renames took $dur milliseconds"
umount $MNT
Testing this change on a box using a non-debug kernel (Debian's default
kernel config) gave the following results:
NUM_FILES=10000, before this patch: 27399 ms
NUM_FILES=10000, after this patch: 9093 ms (-66.8%)
NUM_FILES=5000, before this patch: 9241 ms
NUM_FILES=5000, after this patch: 4642 ms (-49.8%)
NUM_FILES=2000, before this patch: 2550 ms
NUM_FILES=2000, after this patch: 1788 ms (-29.9%)
NUM_FILES=1000, before this patch: 1088 ms
NUM_FILES=1000, after this patch: 905 ms (-16.9%)
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-01-20 11:00:08 +00:00
|
|
|
rename_ctx.index, new_dentry->d_parent);
|
2016-03-17 15:23:38 +01:00
|
|
|
|
|
|
|
|
if (flags & RENAME_WHITEOUT) {
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
ret = btrfs_create_new_inode(trans, &whiteout_args);
|
2016-03-17 15:23:38 +01:00
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 15:23:38 +01:00
|
|
|
goto out_fail;
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
} else {
|
|
|
|
|
unlock_new_inode(whiteout_args.inode);
|
|
|
|
|
iput(whiteout_args.inode);
|
|
|
|
|
whiteout_args.inode = NULL;
|
2016-03-17 15:23:38 +01:00
|
|
|
}
|
2009-09-21 15:56:00 -04:00
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
out_fail:
|
btrfs: do not commit logs and transactions during link and rename operations
Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we
started to commit logs, and fallback to transaction commits when we failed
to log the new names or commit the logs, after link and rename operations
when the target inodes (or their parents) were previously logged in the
current transaction. This was to avoid losing directories despite an
explicit fsync on them when they are ancestors of some inode that got a
new name logged, due to a link or rename operation. However that adds the
cost of starting IO and waiting for it to complete, which can cause higher
latencies for applications.
Instead of doing that, just make sure that when we log a new name for an
inode we don't mark any of its ancestors as logged, so that if any one
does an fsync against any of them, without doing any other change on them,
the fsync commits the log. This way we only pay the cost of a log commit
(or a transaction commit if something goes wrong or a new block group was
created) if the application explicitly asks to fsync any of the parent
directories.
Using dbench, which mixes several filesystems operations including renames,
revealed some significant latency gains. The following script that uses
dbench was used to test this:
#!/bin/bash
DEV=/dev/nvme0n1
MNT=/mnt/btrfs
MOUNT_OPTIONS="-o ssd -o space_cache=v2"
MKFS_OPTIONS="-m single -d single"
THREADS=16
echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
dbench -t 300 -D $MNT $THREADS
umount $MNT
The test was run on bare metal, no virtualization, on a box with 12 cores
(Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel
configuration that is the default of typical distributions (debian in this
case), without debug options enabled (kasan, kmemleak, slub debug, debug
of page allocations, lock debugging, etc).
Results before this patch:
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 10750455 0.011 155.088
Close 7896674 0.001 0.243
Rename 455222 2.158 1101.947
Unlink 2171189 0.067 121.638
Deltree 256 2.425 7.816
Mkdir 128 0.002 0.003
Qpathinfo 9744323 0.006 21.370
Qfileinfo 1707092 0.001 0.146
Qfsinfo 1786756 0.001 11.228
Sfileinfo 875612 0.003 21.263
Find 3767281 0.025 9.617
WriteX 5356924 0.011 211.390
ReadX 16852694 0.003 9.442
LockX 35008 0.002 0.119
UnlockX 35008 0.001 0.138
Flush 753458 4.252 1102.249
Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms
Results after this patch:
16 clients, after
Operation Count AvgLat MaxLat
----------------------------------------
NTCreateX 11471098 0.012 448.281
Close 8426396 0.001 0.925
Rename 485746 0.123 267.183
Unlink 2316477 0.080 63.433
Deltree 288 2.830 11.144
Mkdir 144 0.003 0.010
Qpathinfo 10397420 0.006 10.288
Qfileinfo 1822039 0.001 0.169
Qfsinfo 1906497 0.002 14.039
Sfileinfo 934433 0.004 2.438
Find 4019879 0.026 10.200
WriteX 5718932 0.011 200.985
ReadX 17981671 0.003 10.036
LockX 37352 0.002 0.076
UnlockX 37352 0.001 0.109
Flush 804018 5.015 778.033
Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms
(+6.5% throughput, -29.4% max latency, -75.8% rename latency)
Test case generic/498 from fstests tests the scenario that the previously
mentioned commit fixed.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-11 12:43:48 +01:00
|
|
|
ret2 = btrfs_end_transaction(trans);
|
|
|
|
|
ret = ret ? ret : ret2;
|
2011-03-31 13:23:47 +00:00
|
|
|
out_notrans:
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 18:54:23 -04:00
|
|
|
up_read(&fs_info->subvol_sem);
|
2022-03-14 18:12:32 -07:00
|
|
|
if (flags & RENAME_WHITEOUT)
|
2022-03-14 18:12:34 -07:00
|
|
|
btrfs_new_inode_args_destroy(&whiteout_args);
|
|
|
|
|
out_whiteout_inode:
|
|
|
|
|
if (flags & RENAME_WHITEOUT)
|
|
|
|
|
iput(whiteout_args.inode);
|
2007-06-12 06:35:45 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
|
|
|
|
|
struct dentry *old_dentry, struct inode *new_dir,
|
|
|
|
|
struct dentry *new_dentry, unsigned int flags)
|
2014-07-23 15:15:32 +02:00
|
|
|
{
|
2022-05-31 16:06:32 +01:00
|
|
|
int ret;
|
|
|
|
|
|
2016-03-17 15:23:38 +01:00
|
|
|
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
|
2014-07-23 15:15:32 +02:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
2016-03-17 15:23:38 +01:00
|
|
|
if (flags & RENAME_EXCHANGE)
|
2022-05-31 16:06:32 +01:00
|
|
|
ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
|
|
|
|
|
new_dentry);
|
|
|
|
|
else
|
|
|
|
|
ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
|
|
|
|
|
new_dentry, flags);
|
2016-03-17 15:23:38 +01:00
|
|
|
|
2022-05-31 16:06:32 +01:00
|
|
|
btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
|
|
|
|
|
|
|
|
|
|
return ret;
|
2014-07-23 15:15:32 +02:00
|
|
|
}
|
|
|
|
|
|
2018-04-24 17:23:59 +03:00
|
|
|
struct btrfs_delalloc_work {
|
|
|
|
|
struct inode *inode;
|
|
|
|
|
struct completion completion;
|
|
|
|
|
struct list_head list;
|
|
|
|
|
struct btrfs_work work;
|
|
|
|
|
};
|
|
|
|
|
|
2012-10-25 09:28:04 +00:00
|
|
|
static void btrfs_run_delalloc_work(struct btrfs_work *work)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_delalloc_work *delalloc_work;
|
2013-10-28 15:03:41 -04:00
|
|
|
struct inode *inode;
|
2012-10-25 09:28:04 +00:00
|
|
|
|
|
|
|
|
delalloc_work = container_of(work, struct btrfs_delalloc_work,
|
|
|
|
|
work);
|
2013-10-28 15:03:41 -04:00
|
|
|
inode = delalloc_work->inode;
|
2015-11-27 19:27:11 +01:00
|
|
|
filemap_flush(inode->i_mapping);
|
|
|
|
|
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2013-10-28 15:03:41 -04:00
|
|
|
filemap_flush(inode->i_mapping);
|
2012-10-25 09:28:04 +00:00
|
|
|
|
2018-04-23 10:54:16 +03:00
|
|
|
iput(inode);
|
2012-10-25 09:28:04 +00:00
|
|
|
complete(&delalloc_work->completion);
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-24 17:23:59 +03:00
|
|
|
static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
|
2012-10-25 09:28:04 +00:00
|
|
|
{
|
|
|
|
|
struct btrfs_delalloc_work *work;
|
|
|
|
|
|
2015-12-08 14:39:32 +01:00
|
|
|
work = kmalloc(sizeof(*work), GFP_NOFS);
|
2012-10-25 09:28:04 +00:00
|
|
|
if (!work)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
init_completion(&work->completion);
|
|
|
|
|
INIT_LIST_HEAD(&work->list);
|
|
|
|
|
work->inode = inode;
|
2019-09-16 11:30:57 -07:00
|
|
|
btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
|
2012-10-25 09:28:04 +00:00
|
|
|
|
|
|
|
|
return work;
|
|
|
|
|
}
|
|
|
|
|
|
2008-09-29 15:18:18 -04:00
|
|
|
/*
|
|
|
|
|
* some fairly slow code that needs optimization. This walks the list
|
|
|
|
|
* of all the inodes with pending delalloc and forces them to disk.
|
|
|
|
|
*/
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the users original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on consumer-grade SSD (8GiB ram, 8 CPU) confirm
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
static int start_delalloc_inodes(struct btrfs_root *root,
|
|
|
|
|
struct writeback_control *wbc, bool snapshot,
|
btrfs: fix deadlock when cloning inline extent and low on free metadata space
When cloning an inline extent there are cases where we can not just copy
the inline extent from the source range to the target range (e.g. when the
target range starts at an offset greater than zero). In such cases we copy
the inline extent's data into a page of the destination inode and then
dirty that page. However, after that we will need to start a transaction
for each processed extent and, if we are ever low on available metadata
space, we may need to flush existing delalloc for all dirty inodes in an
attempt to release metadata space - if that happens we may deadlock:
* the async reclaim task queued a delalloc work to flush delalloc for
the destination inode of the clone operation;
* the task executing that delalloc work gets blocked waiting for the
range with the dirty page to be unlocked, which is currently locked
by the task doing the clone operation;
* the async reclaim task blocks waiting for the delalloc work to complete;
* the cloning task is waiting on the waitqueue of its reservation ticket
while holding the range with the dirty page locked in the inode's
io_tree;
* if metadata space is not released by some other task (like delalloc for
some other inode completing for example), the clone task waits forever
and as a consequence the delalloc work and async reclaim tasks will hang
forever as well. Releasing more space on the other hand may require
starting a transaction, which will hang as well when trying to reserve
metadata space, resulting in a deadlock between all these tasks.
When this happens, traces like the following show up in dmesg/syslog:
[87452.323003] INFO: task kworker/u16:11:1810830 blocked for more than 120 seconds.
[87452.323644] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
[87452.324248] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[87452.324852] task:kworker/u16:11 state:D stack: 0 pid:1810830 ppid: 2 flags:0x00004000
[87452.325520] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[87452.326136] Call Trace:
[87452.326737] __schedule+0x5d1/0xcf0
[87452.327390] schedule+0x45/0xe0
[87452.328174] lock_extent_bits+0x1e6/0x2d0 [btrfs]
[87452.328894] ? finish_wait+0x90/0x90
[87452.329474] btrfs_invalidatepage+0x32c/0x390 [btrfs]
[87452.330133] ? __mod_memcg_state+0x8e/0x160
[87452.330738] __extent_writepage+0x2d4/0x400 [btrfs]
[87452.331405] extent_write_cache_pages+0x2b2/0x500 [btrfs]
[87452.332007] ? lock_release+0x20e/0x4c0
[87452.332557] ? trace_hardirqs_on+0x1b/0xf0
[87452.333127] extent_writepages+0x43/0x90 [btrfs]
[87452.333653] ? lock_acquire+0x1a3/0x490
[87452.334177] do_writepages+0x43/0xe0
[87452.334699] ? __filemap_fdatawrite_range+0xa4/0x100
[87452.335720] __filemap_fdatawrite_range+0xc5/0x100
[87452.336500] btrfs_run_delalloc_work+0x17/0x40 [btrfs]
[87452.337216] btrfs_work_helper+0xf1/0x600 [btrfs]
[87452.337838] process_one_work+0x24e/0x5e0
[87452.338437] worker_thread+0x50/0x3b0
[87452.339137] ? process_one_work+0x5e0/0x5e0
[87452.339884] kthread+0x153/0x170
[87452.340507] ? kthread_mod_delayed_work+0xc0/0xc0
[87452.341153] ret_from_fork+0x22/0x30
[87452.341806] INFO: task kworker/u16:1:2426217 blocked for more than 120 seconds.
[87452.342487] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
[87452.343274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[87452.344049] task:kworker/u16:1 state:D stack: 0 pid:2426217 ppid: 2 flags:0x00004000
[87452.344974] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
[87452.345655] Call Trace:
[87452.346305] __schedule+0x5d1/0xcf0
[87452.346947] ? kvm_clock_read+0x14/0x30
[87452.347676] ? wait_for_completion+0x81/0x110
[87452.348389] schedule+0x45/0xe0
[87452.349077] schedule_timeout+0x30c/0x580
[87452.349718] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[87452.350340] ? lock_acquire+0x1a3/0x490
[87452.351006] ? try_to_wake_up+0x7a/0xa20
[87452.351541] ? lock_release+0x20e/0x4c0
[87452.352040] ? lock_acquired+0x199/0x490
[87452.352517] ? wait_for_completion+0x81/0x110
[87452.353000] wait_for_completion+0xab/0x110
[87452.353490] start_delalloc_inodes+0x2af/0x390 [btrfs]
[87452.353973] btrfs_start_delalloc_roots+0x12d/0x250 [btrfs]
[87452.354455] flush_space+0x24f/0x660 [btrfs]
[87452.355063] btrfs_async_reclaim_metadata_space+0x1bb/0x480 [btrfs]
[87452.355565] process_one_work+0x24e/0x5e0
[87452.356024] worker_thread+0x20f/0x3b0
[87452.356487] ? process_one_work+0x5e0/0x5e0
[87452.356973] kthread+0x153/0x170
[87452.357434] ? kthread_mod_delayed_work+0xc0/0xc0
[87452.357880] ret_from_fork+0x22/0x30
(...)
< stack traces of several tasks waiting for the locks of the inodes of the
clone operation >
(...)
[92867.444138] RSP: 002b:00007ffc3371bbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
[92867.444624] RAX: ffffffffffffffda RBX: 00007ffc3371bea0 RCX: 00007f61efe73f97
[92867.445116] RDX: 0000000000000000 RSI: 0000560fbd5d7a40 RDI: 0000560fbd5d8960
[92867.445595] RBP: 00007ffc3371beb0 R08: 0000000000000001 R09: 0000000000000003
[92867.446070] R10: 00007ffc3371b996 R11: 0000000000000246 R12: 0000000000000000
[92867.446820] R13: 000000000000001f R14: 00007ffc3371bea0 R15: 00007ffc3371beb0
[92867.447361] task:fsstress state:D stack: 0 pid:2508238 ppid:2508153 flags:0x00004000
[92867.447920] Call Trace:
[92867.448435] __schedule+0x5d1/0xcf0
[92867.448934] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[92867.449423] schedule+0x45/0xe0
[92867.449916] __reserve_bytes+0x4a4/0xb10 [btrfs]
[92867.450576] ? finish_wait+0x90/0x90
[92867.451202] btrfs_reserve_metadata_bytes+0x29/0x190 [btrfs]
[92867.451815] btrfs_block_rsv_add+0x1f/0x50 [btrfs]
[92867.452412] start_transaction+0x2d1/0x760 [btrfs]
[92867.453216] clone_copy_inline_extent+0x333/0x490 [btrfs]
[92867.453848] ? lock_release+0x20e/0x4c0
[92867.454539] ? btrfs_search_slot+0x9a7/0xc30 [btrfs]
[92867.455218] btrfs_clone+0x569/0x7e0 [btrfs]
[92867.455952] btrfs_clone_files+0xf6/0x150 [btrfs]
[92867.456588] btrfs_remap_file_range+0x324/0x3d0 [btrfs]
[92867.457213] do_clone_file_range+0xd4/0x1f0
[92867.457828] vfs_clone_file_range+0x4d/0x230
[92867.458355] ? lock_release+0x20e/0x4c0
[92867.458890] ioctl_file_clone+0x8f/0xc0
[92867.459377] do_vfs_ioctl+0x342/0x750
[92867.459913] __x64_sys_ioctl+0x62/0xb0
[92867.460377] do_syscall_64+0x33/0x80
[92867.460842] entry_SYSCALL_64_after_hwframe+0x44/0xa9
(...)
< stack traces of more tasks blocked on metadata reservation like the clone
task above, because the async reclaim task has deadlocked >
(...)
Another thing to notice is that the worker task that is deadlocked when
trying to flush the destination inode of the clone operation is at
btrfs_invalidatepage(). This is simply because the clone operation has a
destination offset greater than the i_size and we only update the i_size
of the destination file after cloning an extent (just like we do in the
buffered write path).
Since the async reclaim path uses btrfs_start_delalloc_roots() to trigger
the flushing of delalloc for all inodes that have delalloc, add a runtime
flag to an inode to signal it should not be flushed, and for inodes with
that flag set, start_delalloc_inodes() will simply skip them. When the
cloning code needs to dirty a page to copy an inline extent, set that flag
on the inode and then clear it when the clone operation finishes.
This could be sporadically triggered with test case generic/269 from
fstests, which exercises many fsstress processes running in parallel with
several dd processes filling up the entire filesystem.
CC: stable@vger.kernel.org # 5.9+
Fixes: 05a5a7621ce6 ("Btrfs: implement full reflink support for inline extents")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-12-02 11:55:58 +00:00
|
|
|
bool in_reclaim_context)
|
2008-08-04 23:17:27 -04:00
|
|
|
{
|
|
|
|
|
struct btrfs_inode *binode;
|
2008-09-26 10:05:38 -04:00
|
|
|
struct inode *inode;
|
2012-10-25 09:28:04 +00:00
|
|
|
struct btrfs_delalloc_work *work, *next;
|
|
|
|
|
struct list_head works;
|
2013-01-22 10:49:00 +00:00
|
|
|
struct list_head splice;
|
2012-10-25 09:28:04 +00:00
|
|
|
int ret = 0;
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
bool full_flush = wbc->nr_to_write == LONG_MAX;
|
2008-08-04 23:17:27 -04:00
|
|
|
|
2012-10-25 09:28:04 +00:00
|
|
|
INIT_LIST_HEAD(&works);
|
2013-01-22 10:49:00 +00:00
|
|
|
INIT_LIST_HEAD(&splice);
|
2013-01-22 10:50:35 +00:00
|
|
|
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_lock(&root->delalloc_mutex);
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
|
list_splice_init(&root->delalloc_inodes, &splice);
|
2013-01-22 10:49:00 +00:00
|
|
|
while (!list_empty(&splice)) {
|
|
|
|
|
binode = list_entry(splice.next, struct btrfs_inode,
|
2008-08-04 23:17:27 -04:00
|
|
|
delalloc_inodes);
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
list_move_tail(&binode->delalloc_inodes,
|
|
|
|
|
&root->delalloc_inodes);
|
btrfs: fix deadlock when cloning inline extent and low on free metadata space
When cloning an inline extent there are cases where we can not just copy
the inline extent from the source range to the target range (e.g. when the
target range starts at an offset greater than zero). In such cases we copy
the inline extent's data into a page of the destination inode and then
dirty that page. However, after that we will need to start a transaction
for each processed extent and, if we are ever low on available metadata
space, we may need to flush existing delalloc for all dirty inodes in an
attempt to release metadata space - if that happens we may deadlock:
* the async reclaim task queued a delalloc work to flush delalloc for
the destination inode of the clone operation;
* the task executing that delalloc work gets blocked waiting for the
range with the dirty page to be unlocked, which is currently locked
by the task doing the clone operation;
* the async reclaim task blocks waiting for the delalloc work to complete;
* the cloning task is waiting on the waitqueue of its reservation ticket
while holding the range with the dirty page locked in the inode's
io_tree;
* if metadata space is not released by some other task (like delalloc for
some other inode completing for example), the clone task waits forever
and as a consequence the delalloc work and async reclaim tasks will hang
forever as well. Releasing more space on the other hand may require
starting a transaction, which will hang as well when trying to reserve
metadata space, resulting in a deadlock between all these tasks.
When this happens, traces like the following show up in dmesg/syslog:
[87452.323003] INFO: task kworker/u16:11:1810830 blocked for more than 120 seconds.
[87452.323644] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
[87452.324248] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[87452.324852] task:kworker/u16:11 state:D stack: 0 pid:1810830 ppid: 2 flags:0x00004000
[87452.325520] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[87452.326136] Call Trace:
[87452.326737] __schedule+0x5d1/0xcf0
[87452.327390] schedule+0x45/0xe0
[87452.328174] lock_extent_bits+0x1e6/0x2d0 [btrfs]
[87452.328894] ? finish_wait+0x90/0x90
[87452.329474] btrfs_invalidatepage+0x32c/0x390 [btrfs]
[87452.330133] ? __mod_memcg_state+0x8e/0x160
[87452.330738] __extent_writepage+0x2d4/0x400 [btrfs]
[87452.331405] extent_write_cache_pages+0x2b2/0x500 [btrfs]
[87452.332007] ? lock_release+0x20e/0x4c0
[87452.332557] ? trace_hardirqs_on+0x1b/0xf0
[87452.333127] extent_writepages+0x43/0x90 [btrfs]
[87452.333653] ? lock_acquire+0x1a3/0x490
[87452.334177] do_writepages+0x43/0xe0
[87452.334699] ? __filemap_fdatawrite_range+0xa4/0x100
[87452.335720] __filemap_fdatawrite_range+0xc5/0x100
[87452.336500] btrfs_run_delalloc_work+0x17/0x40 [btrfs]
[87452.337216] btrfs_work_helper+0xf1/0x600 [btrfs]
[87452.337838] process_one_work+0x24e/0x5e0
[87452.338437] worker_thread+0x50/0x3b0
[87452.339137] ? process_one_work+0x5e0/0x5e0
[87452.339884] kthread+0x153/0x170
[87452.340507] ? kthread_mod_delayed_work+0xc0/0xc0
[87452.341153] ret_from_fork+0x22/0x30
[87452.341806] INFO: task kworker/u16:1:2426217 blocked for more than 120 seconds.
[87452.342487] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
[87452.343274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[87452.344049] task:kworker/u16:1 state:D stack: 0 pid:2426217 ppid: 2 flags:0x00004000
[87452.344974] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
[87452.345655] Call Trace:
[87452.346305] __schedule+0x5d1/0xcf0
[87452.346947] ? kvm_clock_read+0x14/0x30
[87452.347676] ? wait_for_completion+0x81/0x110
[87452.348389] schedule+0x45/0xe0
[87452.349077] schedule_timeout+0x30c/0x580
[87452.349718] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[87452.350340] ? lock_acquire+0x1a3/0x490
[87452.351006] ? try_to_wake_up+0x7a/0xa20
[87452.351541] ? lock_release+0x20e/0x4c0
[87452.352040] ? lock_acquired+0x199/0x490
[87452.352517] ? wait_for_completion+0x81/0x110
[87452.353000] wait_for_completion+0xab/0x110
[87452.353490] start_delalloc_inodes+0x2af/0x390 [btrfs]
[87452.353973] btrfs_start_delalloc_roots+0x12d/0x250 [btrfs]
[87452.354455] flush_space+0x24f/0x660 [btrfs]
[87452.355063] btrfs_async_reclaim_metadata_space+0x1bb/0x480 [btrfs]
[87452.355565] process_one_work+0x24e/0x5e0
[87452.356024] worker_thread+0x20f/0x3b0
[87452.356487] ? process_one_work+0x5e0/0x5e0
[87452.356973] kthread+0x153/0x170
[87452.357434] ? kthread_mod_delayed_work+0xc0/0xc0
[87452.357880] ret_from_fork+0x22/0x30
(...)
< stack traces of several tasks waiting for the locks of the inodes of the
clone operation >
(...)
[92867.444138] RSP: 002b:00007ffc3371bbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
[92867.444624] RAX: ffffffffffffffda RBX: 00007ffc3371bea0 RCX: 00007f61efe73f97
[92867.445116] RDX: 0000000000000000 RSI: 0000560fbd5d7a40 RDI: 0000560fbd5d8960
[92867.445595] RBP: 00007ffc3371beb0 R08: 0000000000000001 R09: 0000000000000003
[92867.446070] R10: 00007ffc3371b996 R11: 0000000000000246 R12: 0000000000000000
[92867.446820] R13: 000000000000001f R14: 00007ffc3371bea0 R15: 00007ffc3371beb0
[92867.447361] task:fsstress state:D stack: 0 pid:2508238 ppid:2508153 flags:0x00004000
[92867.447920] Call Trace:
[92867.448435] __schedule+0x5d1/0xcf0
[92867.448934] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[92867.449423] schedule+0x45/0xe0
[92867.449916] __reserve_bytes+0x4a4/0xb10 [btrfs]
[92867.450576] ? finish_wait+0x90/0x90
[92867.451202] btrfs_reserve_metadata_bytes+0x29/0x190 [btrfs]
[92867.451815] btrfs_block_rsv_add+0x1f/0x50 [btrfs]
[92867.452412] start_transaction+0x2d1/0x760 [btrfs]
[92867.453216] clone_copy_inline_extent+0x333/0x490 [btrfs]
[92867.453848] ? lock_release+0x20e/0x4c0
[92867.454539] ? btrfs_search_slot+0x9a7/0xc30 [btrfs]
[92867.455218] btrfs_clone+0x569/0x7e0 [btrfs]
[92867.455952] btrfs_clone_files+0xf6/0x150 [btrfs]
[92867.456588] btrfs_remap_file_range+0x324/0x3d0 [btrfs]
[92867.457213] do_clone_file_range+0xd4/0x1f0
[92867.457828] vfs_clone_file_range+0x4d/0x230
[92867.458355] ? lock_release+0x20e/0x4c0
[92867.458890] ioctl_file_clone+0x8f/0xc0
[92867.459377] do_vfs_ioctl+0x342/0x750
[92867.459913] __x64_sys_ioctl+0x62/0xb0
[92867.460377] do_syscall_64+0x33/0x80
[92867.460842] entry_SYSCALL_64_after_hwframe+0x44/0xa9
(...)
< stack traces of more tasks blocked on metadata reservation like the clone
task above, because the async reclaim task has deadlocked >
(...)
Another thing to notice is that the worker task that is deadlocked when
trying to flush the destination inode of the clone operation is at
btrfs_invalidatepage(). This is simply because the clone operation has a
destination offset greater than the i_size and we only update the i_size
of the destination file after cloning an extent (just like we do in the
buffered write path).
Since the async reclaim path uses btrfs_start_delalloc_roots() to trigger
the flushing of delalloc for all inodes that have delalloc, add a runtime
flag to an inode to signal it should not be flushed, and for inodes with
that flag set, start_delalloc_inodes() will simply skip them. When the
cloning code needs to dirty a page to copy an inline extent, set that flag
on the inode and then clear it when the clone operation finishes.
This could be sporadically triggered with test case generic/269 from
fstests, which exercises many fsstress processes running in parallel with
several dd processes filling up the entire filesystem.
CC: stable@vger.kernel.org # 5.9+
Fixes: 05a5a7621ce6 ("Btrfs: implement full reflink support for inline extents")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-12-02 11:55:58 +00:00
|
|
|
|
|
|
|
|
if (in_reclaim_context &&
|
|
|
|
|
test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
|
|
|
|
|
continue;
|
|
|
|
|
|
2008-09-26 10:05:38 -04:00
|
|
|
inode = igrab(&binode->vfs_inode);
|
2013-01-29 10:11:59 +00:00
|
|
|
if (!inode) {
|
2013-05-15 07:48:22 +00:00
|
|
|
cond_resched_lock(&root->delalloc_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
continue;
|
2013-01-29 10:11:59 +00:00
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2018-11-01 14:49:03 +08:00
|
|
|
if (snapshot)
|
|
|
|
|
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
|
|
|
|
|
&binode->runtime_flags);
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
if (full_flush) {
|
|
|
|
|
work = btrfs_alloc_delalloc_work(inode);
|
|
|
|
|
if (!work) {
|
|
|
|
|
iput(inode);
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
list_add_tail(&work->list, &works);
|
|
|
|
|
btrfs_queue_work(root->fs_info->flush_workers,
|
|
|
|
|
&work->work);
|
|
|
|
|
} else {
|
2021-07-14 14:47:23 -04:00
|
|
|
ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
btrfs_add_delayed_iput(inode);
|
|
|
|
|
if (ret || wbc->nr_to_write <= 0)
|
2020-07-21 10:22:12 -04:00
|
|
|
goto out;
|
|
|
|
|
}
|
2008-09-26 10:05:38 -04:00
|
|
|
cond_resched();
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
2008-08-04 23:17:27 -04:00
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
2008-09-29 11:19:10 -04:00
|
|
|
|
2014-04-02 19:53:32 +08:00
|
|
|
out:
|
2013-05-15 07:48:22 +00:00
|
|
|
list_for_each_entry_safe(work, next, &works, list) {
|
|
|
|
|
list_del_init(&work->list);
|
2018-04-19 10:46:39 +03:00
|
|
|
wait_for_completion(&work->completion);
|
|
|
|
|
kfree(work);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
|
|
2018-04-19 10:46:37 +03:00
|
|
|
if (!list_empty(&splice)) {
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
|
list_splice_tail(&splice, &root->delalloc_inodes);
|
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
|
}
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_unlock(&root->delalloc_mutex);
|
2013-05-15 07:48:22 +00:00
|
|
|
return ret;
|
|
|
|
|
}
|
2013-01-22 10:49:00 +00:00
|
|
|
|
btrfs: fix deadlock when cloning inline extents and using qgroups
There are a few exceptional cases where cloning an inline extent needs to
copy the inline extent data into a page of the destination inode.
When this happens, we end up starting a transaction while having a dirty
page for the destination inode and while having the range locked in the
destination's inode iotree too. Because when reserving metadata space
for a transaction we may need to flush existing delalloc in case there is
not enough free space, we have a mechanism in place to prevent a deadlock,
which was introduced in commit 3d45f221ce627d ("btrfs: fix deadlock when
cloning inline extent and low on free metadata space").
However when using qgroups, a transaction also reserves metadata qgroup
space, which can also result in flushing delalloc in case there is not
enough available space at the moment. When this happens we deadlock, since
flushing delalloc requires locking the file range in the inode's iotree
and the range was already locked at the very beginning of the clone
operation, before attempting to start the transaction.
When this issue happens, stack traces like the following are reported:
[72747.556262] task:kworker/u81:9 state:D stack: 0 pid: 225 ppid: 2 flags:0x00004000
[72747.556268] Workqueue: writeback wb_workfn (flush-btrfs-1142)
[72747.556271] Call Trace:
[72747.556273] __schedule+0x296/0x760
[72747.556277] schedule+0x3c/0xa0
[72747.556279] io_schedule+0x12/0x40
[72747.556284] __lock_page+0x13c/0x280
[72747.556287] ? generic_file_readonly_mmap+0x70/0x70
[72747.556325] extent_write_cache_pages+0x22a/0x440 [btrfs]
[72747.556331] ? __set_page_dirty_nobuffers+0xe7/0x160
[72747.556358] ? set_extent_buffer_dirty+0x5e/0x80 [btrfs]
[72747.556362] ? update_group_capacity+0x25/0x210
[72747.556366] ? cpumask_next_and+0x1a/0x20
[72747.556391] extent_writepages+0x44/0xa0 [btrfs]
[72747.556394] do_writepages+0x41/0xd0
[72747.556398] __writeback_single_inode+0x39/0x2a0
[72747.556403] writeback_sb_inodes+0x1ea/0x440
[72747.556407] __writeback_inodes_wb+0x5f/0xc0
[72747.556410] wb_writeback+0x235/0x2b0
[72747.556414] ? get_nr_inodes+0x35/0x50
[72747.556417] wb_workfn+0x354/0x490
[72747.556420] ? newidle_balance+0x2c5/0x3e0
[72747.556424] process_one_work+0x1aa/0x340
[72747.556426] worker_thread+0x30/0x390
[72747.556429] ? create_worker+0x1a0/0x1a0
[72747.556432] kthread+0x116/0x130
[72747.556435] ? kthread_park+0x80/0x80
[72747.556438] ret_from_fork+0x1f/0x30
[72747.566958] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[72747.566961] Call Trace:
[72747.566964] __schedule+0x296/0x760
[72747.566968] ? finish_wait+0x80/0x80
[72747.566970] schedule+0x3c/0xa0
[72747.566995] wait_extent_bit.constprop.68+0x13b/0x1c0 [btrfs]
[72747.566999] ? finish_wait+0x80/0x80
[72747.567024] lock_extent_bits+0x37/0x90 [btrfs]
[72747.567047] btrfs_invalidatepage+0x299/0x2c0 [btrfs]
[72747.567051] ? find_get_pages_range_tag+0x2cd/0x380
[72747.567076] __extent_writepage+0x203/0x320 [btrfs]
[72747.567102] extent_write_cache_pages+0x2bb/0x440 [btrfs]
[72747.567106] ? update_load_avg+0x7e/0x5f0
[72747.567109] ? enqueue_entity+0xf4/0x6f0
[72747.567134] extent_writepages+0x44/0xa0 [btrfs]
[72747.567137] ? enqueue_task_fair+0x93/0x6f0
[72747.567140] do_writepages+0x41/0xd0
[72747.567144] __filemap_fdatawrite_range+0xc7/0x100
[72747.567167] btrfs_run_delalloc_work+0x17/0x40 [btrfs]
[72747.567195] btrfs_work_helper+0xc2/0x300 [btrfs]
[72747.567200] process_one_work+0x1aa/0x340
[72747.567202] worker_thread+0x30/0x390
[72747.567205] ? create_worker+0x1a0/0x1a0
[72747.567208] kthread+0x116/0x130
[72747.567211] ? kthread_park+0x80/0x80
[72747.567214] ret_from_fork+0x1f/0x30
[72747.569686] task:fsstress state:D stack: 0 pid:841421 ppid:841417 flags:0x00000000
[72747.569689] Call Trace:
[72747.569691] __schedule+0x296/0x760
[72747.569694] schedule+0x3c/0xa0
[72747.569721] try_flush_qgroup+0x95/0x140 [btrfs]
[72747.569725] ? finish_wait+0x80/0x80
[72747.569753] btrfs_qgroup_reserve_data+0x34/0x50 [btrfs]
[72747.569781] btrfs_check_data_free_space+0x5f/0xa0 [btrfs]
[72747.569804] btrfs_buffered_write+0x1f7/0x7f0 [btrfs]
[72747.569810] ? path_lookupat.isra.48+0x97/0x140
[72747.569833] btrfs_file_write_iter+0x81/0x410 [btrfs]
[72747.569836] ? __kmalloc+0x16a/0x2c0
[72747.569839] do_iter_readv_writev+0x160/0x1c0
[72747.569843] do_iter_write+0x80/0x1b0
[72747.569847] vfs_writev+0x84/0x140
[72747.569869] ? btrfs_file_llseek+0x38/0x270 [btrfs]
[72747.569873] do_writev+0x65/0x100
[72747.569876] do_syscall_64+0x33/0x40
[72747.569879] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[72747.569899] task:fsstress state:D stack: 0 pid:841424 ppid:841417 flags:0x00004000
[72747.569903] Call Trace:
[72747.569906] __schedule+0x296/0x760
[72747.569909] schedule+0x3c/0xa0
[72747.569936] try_flush_qgroup+0x95/0x140 [btrfs]
[72747.569940] ? finish_wait+0x80/0x80
[72747.569967] __btrfs_qgroup_reserve_meta+0x36/0x50 [btrfs]
[72747.569989] start_transaction+0x279/0x580 [btrfs]
[72747.570014] clone_copy_inline_extent+0x332/0x490 [btrfs]
[72747.570041] btrfs_clone+0x5b7/0x7a0 [btrfs]
[72747.570068] ? lock_extent_bits+0x64/0x90 [btrfs]
[72747.570095] btrfs_clone_files+0xfc/0x150 [btrfs]
[72747.570122] btrfs_remap_file_range+0x3d8/0x4a0 [btrfs]
[72747.570126] do_clone_file_range+0xed/0x200
[72747.570131] vfs_clone_file_range+0x37/0x110
[72747.570134] ioctl_file_clone+0x7d/0xb0
[72747.570137] do_vfs_ioctl+0x138/0x630
[72747.570140] __x64_sys_ioctl+0x62/0xc0
[72747.570143] do_syscall_64+0x33/0x40
[72747.570146] entry_SYSCALL_64_after_hwframe+0x44/0xa9
So fix this by skipping the flush of delalloc for an inode that is
flagged with BTRFS_INODE_NO_DELALLOC_FLUSH, meaning it is currently under
such a special case of cloning an inline extent, when flushing delalloc
during qgroup metadata reservation.
The special cases for cloning inline extents were added in kernel 5.7 by
commit 05a5a7621ce66c ("Btrfs: implement full reflink support for
inline extents"), while having qgroup metadata space reservation flushing
delalloc when low on space was added in kernel 5.9 by commit
c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get
-EDQUOT"). So use a "Fixes:" tag for the later commit to ease stable
kernel backports.
Reported-by: Wang Yugui <wangyugui@e16-tech.com>
Link: https://lore.kernel.org/linux-btrfs/20210421083137.31E3.409509F4@e16-tech.com/
Fixes: c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
CC: stable@vger.kernel.org # 5.9+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-22 12:08:05 +01:00
|
|
|
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
|
2013-05-15 07:48:22 +00:00
|
|
|
{
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
struct writeback_control wbc = {
|
|
|
|
|
.nr_to_write = LONG_MAX,
|
|
|
|
|
.sync_mode = WB_SYNC_NONE,
|
|
|
|
|
.range_start = 0,
|
|
|
|
|
.range_end = LLONG_MAX,
|
|
|
|
|
};
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2021-10-05 16:35:25 -04:00
|
|
|
if (BTRFS_FS_ERROR(fs_info))
|
2013-05-15 07:48:22 +00:00
|
|
|
return -EROFS;
|
|
|
|
|
|
btrfs: fix deadlock when cloning inline extents and using qgroups
There are a few exceptional cases where cloning an inline extent needs to
copy the inline extent data into a page of the destination inode.
When this happens, we end up starting a transaction while having a dirty
page for the destination inode and while having the range locked in the
destination's inode iotree too. Because when reserving metadata space
for a transaction we may need to flush existing delalloc in case there is
not enough free space, we have a mechanism in place to prevent a deadlock,
which was introduced in commit 3d45f221ce627d ("btrfs: fix deadlock when
cloning inline extent and low on free metadata space").
However when using qgroups, a transaction also reserves metadata qgroup
space, which can also result in flushing delalloc in case there is not
enough available space at the moment. When this happens we deadlock, since
flushing delalloc requires locking the file range in the inode's iotree
and the range was already locked at the very beginning of the clone
operation, before attempting to start the transaction.
When this issue happens, stack traces like the following are reported:
[72747.556262] task:kworker/u81:9 state:D stack: 0 pid: 225 ppid: 2 flags:0x00004000
[72747.556268] Workqueue: writeback wb_workfn (flush-btrfs-1142)
[72747.556271] Call Trace:
[72747.556273] __schedule+0x296/0x760
[72747.556277] schedule+0x3c/0xa0
[72747.556279] io_schedule+0x12/0x40
[72747.556284] __lock_page+0x13c/0x280
[72747.556287] ? generic_file_readonly_mmap+0x70/0x70
[72747.556325] extent_write_cache_pages+0x22a/0x440 [btrfs]
[72747.556331] ? __set_page_dirty_nobuffers+0xe7/0x160
[72747.556358] ? set_extent_buffer_dirty+0x5e/0x80 [btrfs]
[72747.556362] ? update_group_capacity+0x25/0x210
[72747.556366] ? cpumask_next_and+0x1a/0x20
[72747.556391] extent_writepages+0x44/0xa0 [btrfs]
[72747.556394] do_writepages+0x41/0xd0
[72747.556398] __writeback_single_inode+0x39/0x2a0
[72747.556403] writeback_sb_inodes+0x1ea/0x440
[72747.556407] __writeback_inodes_wb+0x5f/0xc0
[72747.556410] wb_writeback+0x235/0x2b0
[72747.556414] ? get_nr_inodes+0x35/0x50
[72747.556417] wb_workfn+0x354/0x490
[72747.556420] ? newidle_balance+0x2c5/0x3e0
[72747.556424] process_one_work+0x1aa/0x340
[72747.556426] worker_thread+0x30/0x390
[72747.556429] ? create_worker+0x1a0/0x1a0
[72747.556432] kthread+0x116/0x130
[72747.556435] ? kthread_park+0x80/0x80
[72747.556438] ret_from_fork+0x1f/0x30
[72747.566958] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[72747.566961] Call Trace:
[72747.566964] __schedule+0x296/0x760
[72747.566968] ? finish_wait+0x80/0x80
[72747.566970] schedule+0x3c/0xa0
[72747.566995] wait_extent_bit.constprop.68+0x13b/0x1c0 [btrfs]
[72747.566999] ? finish_wait+0x80/0x80
[72747.567024] lock_extent_bits+0x37/0x90 [btrfs]
[72747.567047] btrfs_invalidatepage+0x299/0x2c0 [btrfs]
[72747.567051] ? find_get_pages_range_tag+0x2cd/0x380
[72747.567076] __extent_writepage+0x203/0x320 [btrfs]
[72747.567102] extent_write_cache_pages+0x2bb/0x440 [btrfs]
[72747.567106] ? update_load_avg+0x7e/0x5f0
[72747.567109] ? enqueue_entity+0xf4/0x6f0
[72747.567134] extent_writepages+0x44/0xa0 [btrfs]
[72747.567137] ? enqueue_task_fair+0x93/0x6f0
[72747.567140] do_writepages+0x41/0xd0
[72747.567144] __filemap_fdatawrite_range+0xc7/0x100
[72747.567167] btrfs_run_delalloc_work+0x17/0x40 [btrfs]
[72747.567195] btrfs_work_helper+0xc2/0x300 [btrfs]
[72747.567200] process_one_work+0x1aa/0x340
[72747.567202] worker_thread+0x30/0x390
[72747.567205] ? create_worker+0x1a0/0x1a0
[72747.567208] kthread+0x116/0x130
[72747.567211] ? kthread_park+0x80/0x80
[72747.567214] ret_from_fork+0x1f/0x30
[72747.569686] task:fsstress state:D stack: 0 pid:841421 ppid:841417 flags:0x00000000
[72747.569689] Call Trace:
[72747.569691] __schedule+0x296/0x760
[72747.569694] schedule+0x3c/0xa0
[72747.569721] try_flush_qgroup+0x95/0x140 [btrfs]
[72747.569725] ? finish_wait+0x80/0x80
[72747.569753] btrfs_qgroup_reserve_data+0x34/0x50 [btrfs]
[72747.569781] btrfs_check_data_free_space+0x5f/0xa0 [btrfs]
[72747.569804] btrfs_buffered_write+0x1f7/0x7f0 [btrfs]
[72747.569810] ? path_lookupat.isra.48+0x97/0x140
[72747.569833] btrfs_file_write_iter+0x81/0x410 [btrfs]
[72747.569836] ? __kmalloc+0x16a/0x2c0
[72747.569839] do_iter_readv_writev+0x160/0x1c0
[72747.569843] do_iter_write+0x80/0x1b0
[72747.569847] vfs_writev+0x84/0x140
[72747.569869] ? btrfs_file_llseek+0x38/0x270 [btrfs]
[72747.569873] do_writev+0x65/0x100
[72747.569876] do_syscall_64+0x33/0x40
[72747.569879] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[72747.569899] task:fsstress state:D stack: 0 pid:841424 ppid:841417 flags:0x00004000
[72747.569903] Call Trace:
[72747.569906] __schedule+0x296/0x760
[72747.569909] schedule+0x3c/0xa0
[72747.569936] try_flush_qgroup+0x95/0x140 [btrfs]
[72747.569940] ? finish_wait+0x80/0x80
[72747.569967] __btrfs_qgroup_reserve_meta+0x36/0x50 [btrfs]
[72747.569989] start_transaction+0x279/0x580 [btrfs]
[72747.570014] clone_copy_inline_extent+0x332/0x490 [btrfs]
[72747.570041] btrfs_clone+0x5b7/0x7a0 [btrfs]
[72747.570068] ? lock_extent_bits+0x64/0x90 [btrfs]
[72747.570095] btrfs_clone_files+0xfc/0x150 [btrfs]
[72747.570122] btrfs_remap_file_range+0x3d8/0x4a0 [btrfs]
[72747.570126] do_clone_file_range+0xed/0x200
[72747.570131] vfs_clone_file_range+0x37/0x110
[72747.570134] ioctl_file_clone+0x7d/0xb0
[72747.570137] do_vfs_ioctl+0x138/0x630
[72747.570140] __x64_sys_ioctl+0x62/0xc0
[72747.570143] do_syscall_64+0x33/0x40
[72747.570146] entry_SYSCALL_64_after_hwframe+0x44/0xa9
So fix this by skipping the flush of delalloc for an inode that is
flagged with BTRFS_INODE_NO_DELALLOC_FLUSH, meaning it is currently under
such a special case of cloning an inline extent, when flushing delalloc
during qgroup metadata reservation.
The special cases for cloning inline extents were added in kernel 5.7 by
commit 05a5a7621ce66c ("Btrfs: implement full reflink support for
inline extents"), while having qgroup metadata space reservation flushing
delalloc when low on space was added in kernel 5.9 by commit
c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get
-EDQUOT"). So use a "Fixes:" tag for the latter commit to ease stable
kernel backports.
Reported-by: Wang Yugui <wangyugui@e16-tech.com>
Link: https://lore.kernel.org/linux-btrfs/20210421083137.31E3.409509F4@e16-tech.com/
Fixes: c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
CC: stable@vger.kernel.org # 5.9+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-22 12:08:05 +01:00
|
|
|
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
|
|
2021-01-11 12:58:11 +02:00
|
|
|
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
|
btrfs: fix deadlock when cloning inline extent and low on free metadata space
When cloning an inline extent there are cases where we can not just copy
the inline extent from the source range to the target range (e.g. when the
target range starts at an offset greater than zero). In such cases we copy
the inline extent's data into a page of the destination inode and then
dirty that page. However, after that we will need to start a transaction
for each processed extent and, if we are ever low on available metadata
space, we may need to flush existing delalloc for all dirty inodes in an
attempt to release metadata space - if that happens we may deadlock:
* the async reclaim task queued a delalloc work to flush delalloc for
the destination inode of the clone operation;
* the task executing that delalloc work gets blocked waiting for the
range with the dirty page to be unlocked, which is currently locked
by the task doing the clone operation;
* the async reclaim task blocks waiting for the delalloc work to complete;
* the cloning task is waiting on the waitqueue of its reservation ticket
while holding the range with the dirty page locked in the inode's
io_tree;
* if metadata space is not released by some other task (like delalloc for
some other inode completing for example), the clone task waits forever
and as a consequence the delalloc work and async reclaim tasks will hang
forever as well. Releasing more space on the other hand may require
starting a transaction, which will hang as well when trying to reserve
metadata space, resulting in a deadlock between all these tasks.
When this happens, traces like the following show up in dmesg/syslog:
[87452.323003] INFO: task kworker/u16:11:1810830 blocked for more than 120 seconds.
[87452.323644] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
[87452.324248] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[87452.324852] task:kworker/u16:11 state:D stack: 0 pid:1810830 ppid: 2 flags:0x00004000
[87452.325520] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
[87452.326136] Call Trace:
[87452.326737] __schedule+0x5d1/0xcf0
[87452.327390] schedule+0x45/0xe0
[87452.328174] lock_extent_bits+0x1e6/0x2d0 [btrfs]
[87452.328894] ? finish_wait+0x90/0x90
[87452.329474] btrfs_invalidatepage+0x32c/0x390 [btrfs]
[87452.330133] ? __mod_memcg_state+0x8e/0x160
[87452.330738] __extent_writepage+0x2d4/0x400 [btrfs]
[87452.331405] extent_write_cache_pages+0x2b2/0x500 [btrfs]
[87452.332007] ? lock_release+0x20e/0x4c0
[87452.332557] ? trace_hardirqs_on+0x1b/0xf0
[87452.333127] extent_writepages+0x43/0x90 [btrfs]
[87452.333653] ? lock_acquire+0x1a3/0x490
[87452.334177] do_writepages+0x43/0xe0
[87452.334699] ? __filemap_fdatawrite_range+0xa4/0x100
[87452.335720] __filemap_fdatawrite_range+0xc5/0x100
[87452.336500] btrfs_run_delalloc_work+0x17/0x40 [btrfs]
[87452.337216] btrfs_work_helper+0xf1/0x600 [btrfs]
[87452.337838] process_one_work+0x24e/0x5e0
[87452.338437] worker_thread+0x50/0x3b0
[87452.339137] ? process_one_work+0x5e0/0x5e0
[87452.339884] kthread+0x153/0x170
[87452.340507] ? kthread_mod_delayed_work+0xc0/0xc0
[87452.341153] ret_from_fork+0x22/0x30
[87452.341806] INFO: task kworker/u16:1:2426217 blocked for more than 120 seconds.
[87452.342487] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
[87452.343274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[87452.344049] task:kworker/u16:1 state:D stack: 0 pid:2426217 ppid: 2 flags:0x00004000
[87452.344974] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
[87452.345655] Call Trace:
[87452.346305] __schedule+0x5d1/0xcf0
[87452.346947] ? kvm_clock_read+0x14/0x30
[87452.347676] ? wait_for_completion+0x81/0x110
[87452.348389] schedule+0x45/0xe0
[87452.349077] schedule_timeout+0x30c/0x580
[87452.349718] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[87452.350340] ? lock_acquire+0x1a3/0x490
[87452.351006] ? try_to_wake_up+0x7a/0xa20
[87452.351541] ? lock_release+0x20e/0x4c0
[87452.352040] ? lock_acquired+0x199/0x490
[87452.352517] ? wait_for_completion+0x81/0x110
[87452.353000] wait_for_completion+0xab/0x110
[87452.353490] start_delalloc_inodes+0x2af/0x390 [btrfs]
[87452.353973] btrfs_start_delalloc_roots+0x12d/0x250 [btrfs]
[87452.354455] flush_space+0x24f/0x660 [btrfs]
[87452.355063] btrfs_async_reclaim_metadata_space+0x1bb/0x480 [btrfs]
[87452.355565] process_one_work+0x24e/0x5e0
[87452.356024] worker_thread+0x20f/0x3b0
[87452.356487] ? process_one_work+0x5e0/0x5e0
[87452.356973] kthread+0x153/0x170
[87452.357434] ? kthread_mod_delayed_work+0xc0/0xc0
[87452.357880] ret_from_fork+0x22/0x30
(...)
< stack traces of several tasks waiting for the locks of the inodes of the
clone operation >
(...)
[92867.444138] RSP: 002b:00007ffc3371bbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
[92867.444624] RAX: ffffffffffffffda RBX: 00007ffc3371bea0 RCX: 00007f61efe73f97
[92867.445116] RDX: 0000000000000000 RSI: 0000560fbd5d7a40 RDI: 0000560fbd5d8960
[92867.445595] RBP: 00007ffc3371beb0 R08: 0000000000000001 R09: 0000000000000003
[92867.446070] R10: 00007ffc3371b996 R11: 0000000000000246 R12: 0000000000000000
[92867.446820] R13: 000000000000001f R14: 00007ffc3371bea0 R15: 00007ffc3371beb0
[92867.447361] task:fsstress state:D stack: 0 pid:2508238 ppid:2508153 flags:0x00004000
[92867.447920] Call Trace:
[92867.448435] __schedule+0x5d1/0xcf0
[92867.448934] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[92867.449423] schedule+0x45/0xe0
[92867.449916] __reserve_bytes+0x4a4/0xb10 [btrfs]
[92867.450576] ? finish_wait+0x90/0x90
[92867.451202] btrfs_reserve_metadata_bytes+0x29/0x190 [btrfs]
[92867.451815] btrfs_block_rsv_add+0x1f/0x50 [btrfs]
[92867.452412] start_transaction+0x2d1/0x760 [btrfs]
[92867.453216] clone_copy_inline_extent+0x333/0x490 [btrfs]
[92867.453848] ? lock_release+0x20e/0x4c0
[92867.454539] ? btrfs_search_slot+0x9a7/0xc30 [btrfs]
[92867.455218] btrfs_clone+0x569/0x7e0 [btrfs]
[92867.455952] btrfs_clone_files+0xf6/0x150 [btrfs]
[92867.456588] btrfs_remap_file_range+0x324/0x3d0 [btrfs]
[92867.457213] do_clone_file_range+0xd4/0x1f0
[92867.457828] vfs_clone_file_range+0x4d/0x230
[92867.458355] ? lock_release+0x20e/0x4c0
[92867.458890] ioctl_file_clone+0x8f/0xc0
[92867.459377] do_vfs_ioctl+0x342/0x750
[92867.459913] __x64_sys_ioctl+0x62/0xb0
[92867.460377] do_syscall_64+0x33/0x80
[92867.460842] entry_SYSCALL_64_after_hwframe+0x44/0xa9
(...)
< stack traces of more tasks blocked on metadata reservation like the clone
task above, because the async reclaim task has deadlocked >
(...)
Another thing to notice is that the worker task that is deadlocked when
trying to flush the destination inode of the clone operation is at
btrfs_invalidatepage(). This is simply because the clone operation has a
destination offset greater than the i_size and we only update the i_size
of the destination file after cloning an extent (just like we do in the
buffered write path).
Since the async reclaim path uses btrfs_start_delalloc_roots() to trigger
the flushing of delalloc for all inodes that have delalloc, add a runtime
flag to an inode to signal it should not be flushed, and for inodes with
that flag set, start_delalloc_inodes() will simply skip them. When the
cloning code needs to dirty a page to copy an inline extent, set that flag
on the inode and then clear it when the clone operation finishes.
This could be sporadically triggered with test case generic/269 from
fstests, which exercises many fsstress processes running in parallel with
several dd processes filling up the entire filesystem.
CC: stable@vger.kernel.org # 5.9+
Fixes: 05a5a7621ce6 ("Btrfs: implement full reflink support for inline extents")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-12-02 11:55:58 +00:00
|
|
|
bool in_reclaim_context)
|
2013-05-15 07:48:22 +00:00
|
|
|
{
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
struct writeback_control wbc = {
|
2021-01-11 12:58:11 +02:00
|
|
|
.nr_to_write = nr,
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
.sync_mode = WB_SYNC_NONE,
|
|
|
|
|
.range_start = 0,
|
|
|
|
|
.range_end = LLONG_MAX,
|
|
|
|
|
};
|
2013-05-15 07:48:22 +00:00
|
|
|
struct btrfs_root *root;
|
|
|
|
|
struct list_head splice;
|
|
|
|
|
int ret;
|
|
|
|
|
|
2021-10-05 16:35:25 -04:00
|
|
|
if (BTRFS_FS_ERROR(fs_info))
|
2013-05-15 07:48:22 +00:00
|
|
|
return -EROFS;
|
|
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&splice);
|
|
|
|
|
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_lock(&fs_info->delalloc_root_mutex);
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
|
|
|
|
list_splice_init(&fs_info->delalloc_roots, &splice);
|
2021-01-11 12:58:12 +02:00
|
|
|
while (!list_empty(&splice)) {
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
/*
|
|
|
|
|
* Reset nr_to_write here so we know that we're doing a full
|
|
|
|
|
* flush.
|
|
|
|
|
*/
|
2021-01-11 12:58:11 +02:00
|
|
|
if (nr == LONG_MAX)
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
wbc.nr_to_write = LONG_MAX;
|
|
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
root = list_first_entry(&splice, struct btrfs_root,
|
|
|
|
|
delalloc_root);
|
2020-01-24 09:33:01 -05:00
|
|
|
root = btrfs_grab_root(root);
|
2013-05-15 07:48:22 +00:00
|
|
|
BUG_ON(!root);
|
|
|
|
|
list_move_tail(&root->delalloc_root,
|
|
|
|
|
&fs_info->delalloc_roots);
|
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
|
|
|
|
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
|
2020-01-24 09:33:01 -05:00
|
|
|
btrfs_put_root(root);
|
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. To reproduce the user untarred the source
tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would
see it take anywhere from 5 to 20 times as long to untar in 5.10
compared to 5.9. This was observed on fast devices (SSD and better) and
not on HDD.
The root cause is because before we would generally use the normal
writeback path to reclaim delalloc space, and for this we would provide
it with the number of pages we wanted to flush. The referenced commit
changed this to flush that many inodes, which drastically increased the
amount of space we were flushing in certain cases, which severely
affected performance.
We cannot revert this patch unfortunately because of 3d45f221ce62
("btrfs: fix deadlock when cloning inline extent and low on free
metadata space") which requires the ability to skip flushing inodes that
are being cloned in certain scenarios, which means we need to keep using
our flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. I
redid the user's original test and got the following results on one of
our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive)
5.9 0m54.258s
5.10 1m26.212s
5.10+patch 0m38.800s
5.10+patch is significantly faster than plain 5.9 because of my patch
series "Change data reservations to use the ticketing infra" which
contained the patch that introduced the regression, but generally
improved the overall ENOSPC flushing mechanisms.
Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms
the results:
5.10.5 4m00s
5.10.5+patch 1m08s
5.11-rc2 5m14s
5.11-rc2+patch 1m30s
Reported-by: René Rebe <rene@exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: stable@vger.kernel.org # 5.10
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Tested-by: David Sterba <dsterba@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add my test results ]
Signed-off-by: David Sterba <dsterba@suse.com>
2021-01-07 17:08:30 -05:00
|
|
|
if (ret < 0 || wbc.nr_to_write <= 0)
|
2013-05-15 07:48:22 +00:00
|
|
|
goto out;
|
|
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2012-10-25 09:28:04 +00:00
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2014-03-06 13:55:01 +08:00
|
|
|
ret = 0;
|
2013-05-15 07:48:22 +00:00
|
|
|
out:
|
2018-04-19 10:46:37 +03:00
|
|
|
if (!list_empty(&splice)) {
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
|
|
|
|
list_splice_tail(&splice, &fs_info->delalloc_roots);
|
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
}
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_unlock(&fs_info->delalloc_root_mutex);
|
2012-10-25 09:28:04 +00:00
|
|
|
return ret;
|
2008-08-04 23:17:27 -04:00
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
|
|
|
|
|
struct dentry *dentry, const char *symname)
|
2007-06-12 06:35:45 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct btrfs_key key;
|
2022-03-14 18:12:32 -07:00
|
|
|
struct inode *inode;
|
2022-03-14 18:12:34 -07:00
|
|
|
struct btrfs_new_inode_args new_inode_args = {
|
|
|
|
|
.dir = dir,
|
|
|
|
|
.dentry = dentry,
|
|
|
|
|
};
|
|
|
|
|
unsigned int trans_num_items;
|
2007-06-12 06:35:45 -04:00
|
|
|
int err;
|
|
|
|
|
int name_len;
|
|
|
|
|
int datasize;
|
2007-10-15 16:14:19 -04:00
|
|
|
unsigned long ptr;
|
2007-06-12 06:35:45 -04:00
|
|
|
struct btrfs_file_extent_item *ei;
|
2007-10-15 16:14:19 -04:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2013-09-16 09:53:28 +01:00
|
|
|
name_len = strlen(symname);
|
2016-06-22 18:54:23 -04:00
|
|
|
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
|
2007-06-12 06:35:45 -04:00
|
|
|
return -ENAMETOOLONG;
|
2007-12-21 16:27:21 -05:00
|
|
|
|
2022-03-14 18:12:32 -07:00
|
|
|
inode = new_inode(dir->i_sb);
|
|
|
|
|
if (!inode)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
|
|
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
|
|
|
|
inode_nohighmem(inode);
|
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), name_len);
|
|
|
|
|
inode_set_bytes(inode, name_len);
|
2022-03-14 18:12:32 -07:00
|
|
|
|
2022-03-14 18:12:34 -07:00
|
|
|
new_inode_args.inode = inode;
|
|
|
|
|
err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
if (err)
|
|
|
|
|
goto out_inode;
|
2022-03-14 18:12:34 -07:00
|
|
|
/* 1 additional item for the inline extent */
|
|
|
|
|
trans_num_items++;
|
|
|
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, trans_num_items);
|
2022-03-14 18:12:32 -07:00
|
|
|
if (IS_ERR(trans)) {
|
2022-03-14 18:12:34 -07:00
|
|
|
err = PTR_ERR(trans);
|
|
|
|
|
goto out_new_inode_args;
|
2022-03-14 18:12:32 -07:00
|
|
|
}
|
2007-12-21 16:27:21 -05:00
|
|
|
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
err = btrfs_create_new_inode(trans, &new_inode_args);
|
2014-09-08 13:08:51 -07:00
|
|
|
if (err)
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
goto out;
|
2011-12-15 10:09:07 -05:00
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 10:38:47 -07:00
|
|
|
if (!path) {
|
|
|
|
|
err = -ENOMEM;
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
btrfs_abort_transaction(trans, err);
|
|
|
|
|
discard_new_inode(inode);
|
|
|
|
|
inode = NULL;
|
|
|
|
|
goto out;
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 10:38:47 -07:00
|
|
|
}
|
2017-01-10 20:35:31 +02:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2007-06-12 06:35:45 -04:00
|
|
|
key.offset = 0;
|
2014-06-04 18:41:45 +02:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
2007-06-12 06:35:45 -04:00
|
|
|
datasize = btrfs_file_extent_calc_inline_size(name_len);
|
|
|
|
|
err = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
|
datasize);
|
2007-06-22 14:16:25 -04:00
|
|
|
if (err) {
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
btrfs_abort_transaction(trans, err);
|
2011-05-14 07:10:51 +00:00
|
|
|
btrfs_free_path(path);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
discard_new_inode(inode);
|
|
|
|
|
inode = NULL;
|
|
|
|
|
goto out;
|
2007-06-22 14:16:25 -04:00
|
|
|
}
|
2007-10-15 16:14:19 -04:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
|
btrfs_set_file_extent_type(leaf, ei,
|
2007-06-12 06:35:45 -04:00
|
|
|
BTRFS_FILE_EXTENT_INLINE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 14:49:59 -04:00
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
|
|
|
|
|
|
2007-06-12 06:35:45 -04:00
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
2007-10-15 16:14:19 -04:00
|
|
|
write_extent_buffer(leaf, symname, ptr, name_len);
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 06:35:45 -04:00
|
|
|
btrfs_free_path(path);
|
2007-10-15 16:14:19 -04:00
|
|
|
|
2018-05-04 08:23:01 -04:00
|
|
|
d_instantiate_new(dentry, inode);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
err = 0;
|
|
|
|
|
out:
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2022-03-14 18:12:34 -07:00
|
|
|
out_new_inode_args:
|
|
|
|
|
btrfs_new_inode_args_destroy(&new_inode_args);
|
btrfs: move common inode creation code into btrfs_create_new_inode()
All of our inode creation code paths duplicate the calls to
btrfs_init_inode_security() and btrfs_add_link(). Subvolume creation
additionally duplicates property inheritance and the call to
btrfs_set_inode_index(). Fix this by moving the common code into
btrfs_create_new_inode(). This accomplishes a few things at once:
1. It reduces code duplication.
2. It allows us to set up the inode completely before inserting the
inode item, removing calls to btrfs_update_inode().
3. It fixes a leak of an inode on disk in some error cases. For example,
in btrfs_create(), if btrfs_new_inode() succeeds, then we have
inserted an inode item and its inode ref. However, if something after
that fails (e.g., btrfs_init_inode_security()), then we end the
transaction and then decrement the link count on the inode. If the
transaction is committed and the system crashes before the failed
inode is deleted, then we leak that inode on disk. Instead, this
refactoring aborts the transaction when we can't recover more
gracefully.
4. It exposes various ways that subvolume creation diverges from mkdir
in terms of inheriting flags, properties, permissions, and POSIX
ACLs, a lot of which appears to be accidental. This patch explicitly
does _not_ change the existing non-standard behavior, but it makes
those differences more clear in the code and documents them so that
we can discuss whether they should be changed.
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-14 18:12:35 -07:00
|
|
|
out_inode:
|
|
|
|
|
if (err)
|
|
|
|
|
iput(inode);
|
2007-06-12 06:35:45 -04:00
|
|
|
return err;
|
|
|
|
|
}
|
2008-04-10 10:23:21 -04:00
|
|
|
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leaves - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
static struct btrfs_trans_handle *insert_prealloc_file_extent(
|
|
|
|
|
struct btrfs_trans_handle *trans_in,
|
2020-11-02 16:48:54 +02:00
|
|
|
struct btrfs_inode *inode,
|
|
|
|
|
struct btrfs_key *ins,
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessary long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrapper for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
u64 file_offset)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_file_extent_item stack_fi;
|
2020-09-08 11:27:22 +01:00
|
|
|
struct btrfs_replace_extent_info extent_info;
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
struct btrfs_trans_handle *trans = trans_in;
|
|
|
|
|
struct btrfs_path *path;
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
u64 start = ins->objectid;
|
|
|
|
|
u64 len = ins->offset;
|
btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
There is a piece of weird code in insert_prealloc_file_extent(), which
looks like:
ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, ret);
...
}
extent_info.is_new_extent = true;
extent_info.qgroup_reserved = ret;
...
Note how the variable @ret is abused here, and if anyone is adding code
just after btrfs_qgroup_release_data() call, it's super easy to
overwrite the @ret and cause tons of qgroup related bugs.
Fix such abuse by introducing new variable @qgroup_released, so that we
won't reuse the existing variable @ret.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:51 +08:00
|
|
|
int qgroup_released;
|
2020-06-10 09:04:41 +08:00
|
|
|
int ret;
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessarily long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrappers for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
|
|
|
|
|
memset(&stack_fi, 0, sizeof(stack_fi));
|
|
|
|
|
|
|
|
|
|
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
|
|
|
|
|
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
|
|
|
|
|
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
|
|
|
|
|
btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
|
|
|
|
|
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
|
|
|
|
|
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
|
|
|
|
|
/* Encryption and other encoding is reserved and all 0 */
|
|
|
|
|
|
btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
There is a piece of weird code in insert_prealloc_file_extent(), which
looks like:
ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, ret);
...
}
extent_info.is_new_extent = true;
extent_info.qgroup_reserved = ret;
...
Note how the variable @ret is abused here, and if anyone is adding code
just after btrfs_qgroup_release_data() call, it's super easy to
overwrite the @ret and cause tons of qgroup related bugs.
Fix such abuse by introducing new variable @qgroup_released, so that we
won't reuse the existing variable @ret.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:51 +08:00
|
|
|
qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
|
|
|
|
|
if (qgroup_released < 0)
|
|
|
|
|
return ERR_PTR(qgroup_released);
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
|
|
|
|
|
if (trans) {
|
2020-11-02 16:48:54 +02:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
file_offset, &stack_fi,
|
btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
There is a piece of weird code in insert_prealloc_file_extent(), which
looks like:
ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, ret);
...
}
extent_info.is_new_extent = true;
extent_info.qgroup_reserved = ret;
...
Note how the variable @ret is abused here, and if anyone is adding code
just after btrfs_qgroup_release_data() call, it's super easy to
overwrite the @ret and cause tons of qgroup related bugs.
Fix such abuse by introducing new variable @qgroup_released, so that we
won't reuse the existing variable @ret.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:51 +08:00
|
|
|
true, qgroup_released);
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
if (ret)
|
btrfs: fix qgroup data rsv leak caused by falloc failure
[BUG]
When running fsstress with only falloc workload, and a very low qgroup
limit set, we can get qgroup data rsv leak at unmount time.
BTRFS warning (device dm-0): qgroup 0/5 has unreleased space, type 0 rsv 20480
BTRFS error (device dm-0): qgroup reserved space leaked
The minimal reproducer looks like:
#!/bin/bash
dev=/dev/test/test
mnt="/mnt/btrfs"
fsstress=~/xfstests-dev/ltp/fsstress
runtime=8
workload()
{
umount $dev &> /dev/null
umount $mnt &> /dev/null
mkfs.btrfs -f $dev > /dev/null
mount $dev $mnt
btrfs quota en $mnt
btrfs quota rescan -w $mnt
btrfs qgroup limit 16m 0/5 $mnt
$fsstress -w -z -f creat=10 -f fallocate=10 -p 2 -n 100 \
-d $mnt -v > /tmp/fsstress
umount $mnt
if dmesg | grep leak ; then
echo "!!! FAILED !!!"
exit 1
fi
}
for (( i=0; i < $runtime; i++)); do
echo "=== $i/$runtime==="
workload
done
Normally it would fail before round 4.
[CAUSE]
In function insert_prealloc_file_extent(), we first call
btrfs_qgroup_release_data() to know how many bytes are reserved for
qgroup data rsv.
Then use that @qgroup_released number to continue our work.
But after we call btrfs_qgroup_release_data(), we should either queue
@qgroup_released to delayed ref or free them manually in error path.
Unfortunately, we lack the error handling to free the released bytes,
leaking qgroup data rsv.
All the error handling functions outside won't help at all, as we have
released the range, meaning that in the inode io tree the EXTENT_QGROUP_RESERVED
bit is already cleared, thus all btrfs_qgroup_free_data() calls won't
free any data rsv.
[FIX]
Add free_qgroup tag to manually free the released qgroup data rsv.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Reported-by: David Sterba <dsterba@suse.cz>
Fixes: 9729f10a608f ("btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent()")
CC: stable@vger.kernel.org # 5.10+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:52 +08:00
|
|
|
goto free_qgroup;
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leaves - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
return trans;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
extent_info.disk_offset = start;
|
|
|
|
|
extent_info.disk_len = len;
|
|
|
|
|
extent_info.data_offset = 0;
|
|
|
|
|
extent_info.data_len = len;
|
|
|
|
|
extent_info.file_offset = file_offset;
|
|
|
|
|
extent_info.extent_buf = (char *)&stack_fi;
|
|
|
|
|
extent_info.is_new_extent = true;
|
btrfs: add missing inode updates on each iteration when replacing extents
When replacing file extents, called during fallocate, hole punching,
clone and deduplication, we may not be able to replace/drop all the
target file extent items with a single transaction handle. We may get
-ENOSPC while doing it, in which case we release the transaction handle,
balance the dirty pages of the btree inode, flush delayed items and get
a new transaction handle to operate on what's left of the target range.
By dropping and replacing file extent items we have effectively modified
the inode, so we should bump its iversion and update its mtime/ctime
before we update the inode item. This is because if the transaction
we used for partially modifying the inode gets committed by someone after
we release it and before we finish the rest of the range, and a power failure
happens, then after mounting the filesystem our inode has an outdated
iversion and mtime/ctime, corresponding to the values it had before we
changed it.
So add the missing iversion and mtime/ctime updates.
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-06 10:41:18 +01:00
|
|
|
extent_info.update_times = true;
|
btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
There is a piece of weird code in insert_prealloc_file_extent(), which
looks like:
ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, ret);
...
}
extent_info.is_new_extent = true;
extent_info.qgroup_reserved = ret;
...
Note how the variable @ret is abused here, and if anyone is adding code
just after btrfs_qgroup_release_data() call, it's super easy to
overwrite the @ret and cause tons of qgroup related bugs.
Fix such abuse by introducing new variable @qgroup_released, so that we
won't reuse the existing variable @ret.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:51 +08:00
|
|
|
extent_info.qgroup_reserved = qgroup_released;
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leaves - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
extent_info.insertions = 0;
|
|
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: fix qgroup data rsv leak caused by falloc failure
[BUG]
When running fsstress with only falloc workload, and a very low qgroup
limit set, we can get qgroup data rsv leak at unmount time.
BTRFS warning (device dm-0): qgroup 0/5 has unreleased space, type 0 rsv 20480
BTRFS error (device dm-0): qgroup reserved space leaked
The minimal reproducer looks like:
#!/bin/bash
dev=/dev/test/test
mnt="/mnt/btrfs"
fsstress=~/xfstests-dev/ltp/fsstress
runtime=8
workload()
{
umount $dev &> /dev/null
umount $mnt &> /dev/null
mkfs.btrfs -f $dev > /dev/null
mount $dev $mnt
btrfs quota en $mnt
btrfs quota rescan -w $mnt
btrfs qgroup limit 16m 0/5 $mnt
$fsstress -w -z -f creat=10 -f fallocate=10 -p 2 -n 100 \
-d $mnt -v > /tmp/fsstress
umount $mnt
if dmesg | grep leak ; then
echo "!!! FAILED !!!"
exit 1
fi
}
for (( i=0; i < $runtime; i++)); do
echo "=== $i/$runtime==="
workload
done
Normally it would fail before round 4.
[CAUSE]
In function insert_prealloc_file_extent(), we first call
btrfs_qgroup_release_data() to know how many bytes are reserved for
qgroup data rsv.
Then use that @qgroup_released number to continue our work.
But after we call btrfs_qgroup_release_data(), we should either queue
@qgroup_released to delayed ref or free them manually in error path.
Unfortunately, we lack the error handling to free the released bytes,
leaking qgroup data rsv.
All the error handling functions outside won't help at all, as we have
released the range, meaning that in the inode io tree the EXTENT_QGROUP_RESERVED
bit is already cleared, thus all btrfs_qgroup_free_data() calls won't
free any data rsv.
[FIX]
Add free_qgroup tag to manually free the released qgroup data rsv.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Reported-by: David Sterba <dsterba@suse.cz>
Fixes: 9729f10a608f ("btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent()")
CC: stable@vger.kernel.org # 5.10+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:52 +08:00
|
|
|
if (!path) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto free_qgroup;
|
|
|
|
|
}
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leaves - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
|
2021-02-17 15:12:47 +02:00
|
|
|
ret = btrfs_replace_file_extents(inode, path, file_offset,
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leaves - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
file_offset + len - 1, &extent_info,
|
|
|
|
|
&trans);
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
if (ret)
|
btrfs: fix qgroup data rsv leak caused by falloc failure
[BUG]
When running fsstress with only falloc workload, and a very low qgroup
limit set, we can get qgroup data rsv leak at unmount time.
BTRFS warning (device dm-0): qgroup 0/5 has unreleased space, type 0 rsv 20480
BTRFS error (device dm-0): qgroup reserved space leaked
The minimal reproducer looks like:
#!/bin/bash
dev=/dev/test/test
mnt="/mnt/btrfs"
fsstress=~/xfstests-dev/ltp/fsstress
runtime=8
workload()
{
umount $dev &> /dev/null
umount $mnt &> /dev/null
mkfs.btrfs -f $dev > /dev/null
mount $dev $mnt
btrfs quota en $mnt
btrfs quota rescan -w $mnt
btrfs qgroup limit 16m 0/5 $mnt
$fsstress -w -z -f creat=10 -f fallocate=10 -p 2 -n 100 \
-d $mnt -v > /tmp/fsstress
umount $mnt
if dmesg | grep leak ; then
echo "!!! FAILED !!!"
exit 1
fi
}
for (( i=0; i < $runtime; i++)); do
echo "=== $i/$runtime==="
workload
done
Normally it would fail before round 4.
[CAUSE]
In function insert_prealloc_file_extent(), we first call
btrfs_qgroup_release_data() to know how many bytes are reserved for
qgroup data rsv.
Then use that @qgroup_released number to continue our work.
But after we call btrfs_qgroup_release_data(), we should either queue
@qgroup_released to delayed ref or free them manually in error path.
Unfortunately, we lack the error handling to free the released bytes,
leaking qgroup data rsv.
All the error handling functions outside won't help at all, as we have
released the range, meaning that in the inode io tree the EXTENT_QGROUP_RESERVED
bit is already cleared, thus any btrfs_qgroup_free_data() call won't
free any data rsv.
[FIX]
Add free_qgroup tag to manually free the released qgroup data rsv.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Reported-by: David Sterba <dsterba@suse.cz>
Fixes: 9729f10a608f ("btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent()")
CC: stable@vger.kernel.org # 5.10+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:52 +08:00
|
|
|
goto free_qgroup;
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
return trans;
|
btrfs: fix qgroup data rsv leak caused by falloc failure
[BUG]
When running fsstress with only falloc workload, and a very low qgroup
limit set, we can get qgroup data rsv leak at unmount time.
BTRFS warning (device dm-0): qgroup 0/5 has unreleased space, type 0 rsv 20480
BTRFS error (device dm-0): qgroup reserved space leaked
The minimal reproducer looks like:
#!/bin/bash
dev=/dev/test/test
mnt="/mnt/btrfs"
fsstress=~/xfstests-dev/ltp/fsstress
runtime=8
workload()
{
umount $dev &> /dev/null
umount $mnt &> /dev/null
mkfs.btrfs -f $dev > /dev/null
mount $dev $mnt
btrfs quota en $mnt
btrfs quota rescan -w $mnt
btrfs qgroup limit 16m 0/5 $mnt
$fsstress -w -z -f creat=10 -f fallocate=10 -p 2 -n 100 \
-d $mnt -v > /tmp/fsstress
umount $mnt
if dmesg | grep leak ; then
echo "!!! FAILED !!!"
exit 1
fi
}
for (( i=0; i < $runtime; i++)); do
echo "=== $i/$runtime==="
workload
done
Normally it would fail before round 4.
[CAUSE]
In function insert_prealloc_file_extent(), we first call
btrfs_qgroup_release_data() to know how many bytes are reserved for
qgroup data rsv.
Then use that @qgroup_released number to continue our work.
But after we call btrfs_qgroup_release_data(), we should either queue
@qgroup_released to delayed ref or free them manually in error path.
Unfortunately, we lack the error handling to free the released bytes,
leaking qgroup data rsv.
All the error handling functions outside won't help at all, as we have
released the range, meaning that in the inode io tree the EXTENT_QGROUP_RESERVED
bit is already cleared, thus any btrfs_qgroup_free_data() call won't
free any data rsv.
[FIX]
Add free_qgroup tag to manually free the released qgroup data rsv.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Reported-by: David Sterba <dsterba@suse.cz>
Fixes: 9729f10a608f ("btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent()")
CC: stable@vger.kernel.org # 5.10+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-03-03 18:41:52 +08:00
|
|
|
|
|
|
|
|
free_qgroup:
|
|
|
|
|
/*
|
|
|
|
|
* We have released qgroup data range at the beginning of the function,
|
|
|
|
|
* and normally qgroup_released bytes will be freed when committing
|
|
|
|
|
* transaction.
|
|
|
|
|
* But if we error out early, we have to free what we have released
|
|
|
|
|
* or we leak qgroup data reservation.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_qgroup_free_refroot(inode->root->fs_info,
|
|
|
|
|
inode->root->root_key.objectid, qgroup_released,
|
|
|
|
|
BTRFS_QGROUP_RSV_DATA);
|
|
|
|
|
return ERR_PTR(ret);
|
btrfs: inode: refactor the parameters of insert_reserved_file_extent()
Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding.
This makes the parameter list unnecessary long for a function which only
gets called twice.
This patch will refactor the parameter list, by using
btrfs_file_extent_item as parameter directly to hugely reduce the number
of parameters.
Also, since there are only two callers, one in btrfs_finish_ordered_io()
which inserts file extent for ordered extent, and one
__btrfs_prealloc_file_range().
These two call sites have completely different context, where ordered
extent can be compressed, but will always be regular extent, while the
preallocated one is never going to be compressed and always has PREALLOC
type.
So use two small wrapper for these two different call sites to improve
readability.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-06-10 09:04:40 +08:00
|
|
|
}
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
|
2010-06-21 14:48:16 -04:00
|
|
|
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
|
loff_t actual_len, u64 *alloc_hint,
|
|
|
|
|
struct btrfs_trans_handle *trans)
|
2008-10-30 14:25:28 -04:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
|
struct extent_map *em;
|
2008-10-30 14:25:28 -04:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_key ins;
|
|
|
|
|
u64 cur_offset = start;
|
2020-02-13 10:47:31 -05:00
|
|
|
u64 clear_offset = start;
|
2010-11-22 18:50:32 +00:00
|
|
|
u64 i_size;
|
2013-03-05 11:11:26 -05:00
|
|
|
u64 cur_bytes;
|
2015-09-23 17:11:16 -04:00
|
|
|
u64 last_alloc = (u64)-1;
|
2008-10-30 14:25:28 -04:00
|
|
|
int ret = 0;
|
2010-06-21 14:48:16 -04:00
|
|
|
bool own_trans = true;
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk, call
btrfs_start_delalloc_roots(), or commit the current transaction in order to
reserve some free space, which is obviously a lot of work. But it is not
necessary: as long as bytes_may_use is decreased in a timely manner, we still
have free space after subtracting 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
u64 end = start + num_bytes - 1;
|
2008-10-30 14:25:28 -04:00
|
|
|
|
2010-06-21 14:48:16 -04:00
|
|
|
if (trans)
|
|
|
|
|
own_trans = false;
|
2008-10-30 14:25:28 -04:00
|
|
|
while (num_bytes > 0) {
|
2015-12-15 01:42:10 +09:00
|
|
|
cur_bytes = min_t(u64, num_bytes, SZ_256M);
|
2013-03-05 11:11:26 -05:00
|
|
|
cur_bytes = max(cur_bytes, min_size);
|
2015-09-23 17:11:16 -04:00
|
|
|
/*
|
|
|
|
|
* If we are severely fragmented we could end up with really
|
|
|
|
|
* small allocations, so if the allocator is returning small
|
|
|
|
|
* chunks lets make its job easier by only searching for those
|
|
|
|
|
* sized chunks.
|
|
|
|
|
*/
|
|
|
|
|
cur_bytes = min(cur_bytes, last_alloc);
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk, call
btrfs_start_delalloc_roots(), or commit the current transaction in order to
reserve some free space, which is obviously a lot of work. But it is not
necessary: as long as bytes_may_use is decreased in a timely manner, we still
have free space after subtracting 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
|
|
|
|
|
min_size, 0, *alloc_hint, &ins, 1, 0);
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
if (ret)
|
2010-05-16 10:48:46 -04:00
|
|
|
break;
|
2020-02-13 10:47:31 -05:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We've reserved this space, and thus converted it from
|
|
|
|
|
* ->bytes_may_use to ->bytes_reserved. Any error that happens
|
|
|
|
|
* from here on out we will only need to clear our reservation
|
|
|
|
|
* for the remaining unreserved area, so advance our
|
|
|
|
|
* clear_offset by our extent size.
|
|
|
|
|
*/
|
|
|
|
|
clear_offset += ins.offset;
|
2009-11-12 09:34:52 +00:00
|
|
|
|
2015-09-23 17:11:16 -04:00
|
|
|
last_alloc = ins.offset;
|
2020-11-02 16:48:54 +02:00
|
|
|
trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
|
|
|
|
|
&ins, cur_offset);
|
btrfs: fix relocation failure due to race with fallocate
When doing a fallocate() we have a short time window, after reserving an
extent and before starting a transaction, where if relocation for the block
group containing the reserved extent happens, we can end up missing the
extent in the data relocation inode causing relocation to fail later.
This only happens when we don't pass a transaction to the internal
fallocate function __btrfs_prealloc_file_range(), which is for all the
cases where fallocate() is called from user space (the internal use cases
include space cache extent allocation and relocation).
When the race triggers the relocation failure, it produces a trace like
the following:
[200611.995995] ------------[ cut here ]------------
[200611.997084] BTRFS: Transaction aborted (error -2)
[200611.998208] WARNING: CPU: 3 PID: 235845 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x3a0/0x5b0 [btrfs]
[200611.999042] Modules linked in: dm_thin_pool dm_persistent_data (...)
[200612.003287] CPU: 3 PID: 235845 Comm: btrfs Not tainted 5.9.0-rc6-btrfs-next-69 #1
[200612.004442] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[200612.006186] RIP: 0010:__btrfs_cow_block+0x3a0/0x5b0 [btrfs]
[200612.007110] Code: 1b 00 00 02 72 2a 83 f8 fb 0f 84 b8 01 (...)
[200612.007341] BTRFS warning (device sdb): Skipping commit of aborted transaction.
[200612.008959] RSP: 0018:ffffaee38550f918 EFLAGS: 00010286
[200612.009672] BTRFS: error (device sdb) in cleanup_transaction:1901: errno=-30 Readonly filesystem
[200612.010428] RAX: 0000000000000000 RBX: ffff9174d96f4000 RCX: 0000000000000000
[200612.011078] BTRFS info (device sdb): forced readonly
[200612.011862] RDX: 0000000000000001 RSI: ffffffffa8161978 RDI: 00000000ffffffff
[200612.013215] RBP: ffff9172569a0f80 R08: 0000000000000000 R09: 0000000000000000
[200612.014263] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9174b8403b88
[200612.015203] R13: ffff9174b8400a88 R14: ffff9174c90f1000 R15: ffff9174a5a60e08
[200612.016182] FS: 00007fa55cf878c0(0000) GS:ffff9174ece00000(0000) knlGS:0000000000000000
[200612.017174] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[200612.018418] CR2: 00007f8fb8048148 CR3: 0000000428a46003 CR4: 00000000003706e0
[200612.019510] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[200612.020648] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[200612.021520] Call Trace:
[200612.022434] btrfs_cow_block+0x10b/0x250 [btrfs]
[200612.023407] do_relocation+0x54e/0x7b0 [btrfs]
[200612.024343] ? do_raw_spin_unlock+0x4b/0xc0
[200612.025280] ? _raw_spin_unlock+0x29/0x40
[200612.026200] relocate_tree_blocks+0x3bc/0x6d0 [btrfs]
[200612.027088] relocate_block_group+0x2f3/0x600 [btrfs]
[200612.027961] btrfs_relocate_block_group+0x15e/0x340 [btrfs]
[200612.028896] btrfs_relocate_chunk+0x38/0x110 [btrfs]
[200612.029772] btrfs_balance+0xb22/0x1790 [btrfs]
[200612.030601] ? btrfs_ioctl_balance+0x253/0x380 [btrfs]
[200612.031414] btrfs_ioctl_balance+0x2cf/0x380 [btrfs]
[200612.032279] btrfs_ioctl+0x620/0x36f0 [btrfs]
[200612.033077] ? _raw_spin_unlock+0x29/0x40
[200612.033948] ? handle_mm_fault+0x116d/0x1ca0
[200612.034749] ? up_read+0x18/0x240
[200612.035542] ? __x64_sys_ioctl+0x83/0xb0
[200612.036244] __x64_sys_ioctl+0x83/0xb0
[200612.037269] do_syscall_64+0x33/0x80
[200612.038190] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[200612.038976] RIP: 0033:0x7fa55d07ed87
[200612.040127] Code: 00 00 00 48 8b 05 09 91 0c 00 64 c7 00 26 (...)
[200612.041669] RSP: 002b:00007ffd5ebf03e8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
[200612.042437] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fa55d07ed87
[200612.043511] RDX: 00007ffd5ebf0470 RSI: 00000000c4009420 RDI: 0000000000000003
[200612.044250] RBP: 0000000000000003 R08: 000055d8362642a0 R09: 00007fa55d148be0
[200612.044963] R10: fffffffffffff52e R11: 0000000000000206 R12: 00007ffd5ebf1614
[200612.045683] R13: 00007ffd5ebf0470 R14: 0000000000000002 R15: 00007ffd5ebf0470
[200612.046361] irq event stamp: 0
[200612.047040] hardirqs last enabled at (0): [<0000000000000000>] 0x0
[200612.047725] hardirqs last disabled at (0): [<ffffffffa6eb5ab3>] copy_process+0x823/0x1bc0
[200612.048387] softirqs last enabled at (0): [<ffffffffa6eb5ab3>] copy_process+0x823/0x1bc0
[200612.049024] softirqs last disabled at (0): [<0000000000000000>] 0x0
[200612.049722] ---[ end trace 49006c6876e65227 ]---
The race happens like this:
1) Task A starts an fallocate() (plain or zero range) and it calls
__btrfs_prealloc_file_range() with the 'trans' parameter set to NULL;
2) Task A calls btrfs_reserve_extent() and gets an extent that belongs to
block group X;
3) Before task A gets into btrfs_replace_file_extents(), through the call
to insert_prealloc_file_extent(), task B starts relocation of block
group X;
4) Task B enters btrfs_relocate_block_group() and it sets block group X to
RO mode;
5) Task B enters relocate_block_group(), it calls prepare_to_relocate()
which joins/starts a transaction and then commits the transaction;
6) Task B then starts scanning the extent tree looking for extents that
belong to block group X - it does not yet find the extent reserved by
task A, since that extent was not yet added to the extent tree, as its
delayed reference was not even yet created at this point;
7) The data relocation inode ends up not having the extent reserved by
task A associated to it;
8) Task A then starts a transaction through btrfs_replace_file_extents(),
inserts a file extent item in the subvolume tree pointing to the
reserved extent and creates a delayed reference for it;
9) Task A finishes and returns success to user space;
10) Later on, while relocation is still in progress, the leaf where task A
inserted the new file extent item is COWed, so we end up at
__btrfs_cow_block(), which calls btrfs_reloc_cow_block(), and that in
turn calls relocation.c:replace_file_extents();
11) At relocation.c:replace_file_extents() we iterate over all the items in
the leaf and find the file extent item pointing to the extent that was
allocated by task A, and then call relocation.c:get_new_location(), to
find the new location for the extent;
12) However relocation.c:get_new_location() fails, returning -ENOENT,
because it couldn't find a corresponding file extent item associated
with the data relocation inode. This is because the extent was not seen
in the extent tree at step 6). The -ENOENT error is propagated to
__btrfs_cow_block(), which aborts the transaction.
So fix this simply by decrementing the block group's number of reservations
after calling insert_prealloc_file_extent(), as relocation waits for that
counter to go down to zero before calling prepare_to_relocate() and start
looking for extents in the extent tree.
This issue only started to happen recently as of commit 8fccebfa534c79
("btrfs: fix metadata reservation for fallocate that leads to transaction
aborts"), because now we can reserve an extent before starting/joining a
transaction, and previously we always did it after that, so relocation
ended up waiting for a concurrent fallocate() to finish because before
searching for the extents of the block group, it starts/joins a transaction
and then commits it (at prepare_to_relocate()), which made it wait for the
fallocate task to complete first.
Fixes: 8fccebfa534c79 ("btrfs: fix metadata reservation for fallocate that leads to transaction aborts")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-10-14 10:10:36 +01:00
|
|
|
/*
|
|
|
|
|
* Now that we inserted the prealloc extent we can finally
|
|
|
|
|
* decrement the number of reservations in the block group.
|
|
|
|
|
* If we did it before, we could race with relocation and have
|
|
|
|
|
* relocation miss the reserved extent, making it fail later.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
|
ret = PTR_ERR(trans);
|
2016-06-22 18:54:24 -04:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree or the
free space cache. When we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
ins.offset, 0);
|
2012-03-12 16:03:00 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2014-12-12 16:44:35 +08:00
|
|
|
|
2017-02-20 13:50:45 +02:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
|
2009-09-11 12:27:37 -04:00
|
|
|
cur_offset + ins.offset -1, 0);
|
2009-11-12 09:34:52 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
em = alloc_extent_map();
|
|
|
|
|
if (!em) {
|
btrfs: reset last_reflink_trans after fsyncing inode
When an inode has a last_reflink_trans matching the current transaction,
we have to take special care when logging its checksums in order to
avoid getting checksum items with overlapping ranges in a log tree,
which could result in missing checksums after log replay (more on that
in the changelogs of commit 40e046acbd2f36 ("Btrfs: fix missing data
checksums after replaying a log tree") and commit e289f03ea79bbc ("btrfs:
fix corrupt log due to concurrent fsync of inodes with shared extents")).
We also need to make sure a full fsync will copy all old file extent
items it finds in modified leaves, because they might have been copied
from some other inode.
However once we fsync an inode, we don't need to keep paying the price of
that extra special care in future fsyncs done in the same transaction,
unless the inode is used for another reflink operation or the full sync
flag is set on it (truncate, failure to allocate extent maps for holes,
and other exceptional and infrequent cases).
So after we fsync an inode reset its last_reflink_trans to zero. In case
another reflink happens, we continue to update the last_reflink_trans of
the inode, just as before. Also set last_reflink_trans to the generation
of the last transaction that modified the inode whenever we need to set
the full sync flag on the inode, just like when we need to load an inode
from disk after eviction.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-02-17 12:12:06 +00:00
|
|
|
btrfs_set_inode_full_sync(BTRFS_I(inode));
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
goto next;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
em->start = cur_offset;
|
|
|
|
|
em->orig_start = cur_offset;
|
|
|
|
|
em->len = ins.offset;
|
|
|
|
|
em->block_start = ins.objectid;
|
|
|
|
|
em->block_len = ins.offset;
|
2012-12-03 10:31:19 -05:00
|
|
|
em->orig_block_len = ins.offset;
|
2013-04-04 14:31:27 -04:00
|
|
|
em->ram_bytes = ins.offset;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
|
|
|
|
|
em->generation = trans->transid;
|
|
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-05 16:51:15 -04:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
|
if (ret != -EEXIST)
|
|
|
|
|
break;
|
2017-02-20 13:50:45 +02:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 13:14:17 -04:00
|
|
|
cur_offset + ins.offset - 1,
|
|
|
|
|
0);
|
|
|
|
|
}
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
next:
|
2008-10-30 14:25:28 -04:00
|
|
|
num_bytes -= ins.offset;
|
|
|
|
|
cur_offset += ins.offset;
|
2010-05-16 10:49:59 -04:00
|
|
|
*alloc_hint = ins.objectid + ins.offset;
|
2009-11-12 09:34:52 +00:00
|
|
|
|
2012-04-05 15:03:02 -04:00
|
|
|
inode_inc_iversion(inode);
|
2016-09-14 07:48:06 -07:00
|
|
|
inode->i_ctime = current_time(inode);
|
2009-04-17 10:37:41 +02:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
|
2008-10-30 14:25:28 -04:00
|
|
|
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
2010-05-16 10:49:59 -04:00
|
|
|
(actual_len > inode->i_size) &&
|
|
|
|
|
(cur_offset > inode->i_size)) {
|
2010-01-20 07:28:54 +00:00
|
|
|
if (cur_offset > actual_len)
|
2010-11-22 18:50:32 +00:00
|
|
|
i_size = actual_len;
|
2010-01-20 07:28:54 +00:00
|
|
|
else
|
2010-11-22 18:50:32 +00:00
|
|
|
i_size = cur_offset;
|
|
|
|
|
i_size_write(inode, i_size);
|
2020-11-02 16:48:53 +02:00
|
|
|
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
|
2009-11-12 09:34:52 +00:00
|
|
|
}
|
|
|
|
|
|
2020-11-02 16:48:59 +02:00
|
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
2012-03-12 16:03:00 +01:00
|
|
|
|
|
|
|
|
if (ret) {
|
2016-06-10 18:19:25 -04:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 16:03:00 +01:00
|
|
|
if (own_trans)
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2012-03-12 16:03:00 +01:00
|
|
|
break;
|
|
|
|
|
}
|
2008-10-30 14:25:28 -04:00
|
|
|
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
if (own_trans) {
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it needs to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 11:27:20 +01:00
|
|
|
trans = NULL;
|
|
|
|
|
}
|
2009-11-12 09:34:52 +00:00
|
|
|
}
|
2020-02-13 10:47:31 -05:00
|
|
|
if (clear_offset < end)
|
2020-06-03 08:55:39 +03:00
|
|
|
btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
|
2020-02-13 10:47:31 -05:00
|
|
|
end - clear_offset + 1);
|
2008-10-30 14:25:28 -04:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2010-06-21 14:48:16 -04:00
|
|
|
int btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
|
{
|
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
|
min_size, actual_len, alloc_hint,
|
|
|
|
|
NULL);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int btrfs_prealloc_file_range_trans(struct inode *inode,
|
|
|
|
|
struct btrfs_trans_handle *trans, int mode,
|
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
|
{
|
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
|
min_size, actual_len, alloc_hint, trans);
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int btrfs_permission(struct user_namespace *mnt_userns,
|
|
|
|
|
struct inode *inode, int mask)
|
2008-01-14 13:26:08 -05:00
|
|
|
{
|
2010-12-20 16:04:08 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-08-15 17:27:21 +00:00
|
|
|
umode_t mode = inode->i_mode;
|
2010-12-20 16:04:08 +08:00
|
|
|
|
2011-08-15 17:27:21 +00:00
|
|
|
if (mask & MAY_WRITE &&
|
|
|
|
|
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
|
|
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
|
return -EROFS;
|
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
|
|
|
|
|
return -EACCES;
|
|
|
|
|
}
|
2021-07-27 12:48:50 +02:00
|
|
|
return generic_permission(mnt_userns, inode, mask);
|
2008-01-14 13:26:08 -05:00
|
|
|
}
|
2007-06-12 06:35:45 -04:00
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
/*
 * Create an unlinked temporary file in @dir and attach it to @dentry.
 *
 * A new VFS inode is allocated and given the regular-file operations, then
 * created on disk as an orphan (.orphan = true) inside a transaction sized
 * by btrfs_new_inode_prepare(). On success the inode is bound to @dentry
 * via d_tmpfile(); on any failure the inode reference is dropped.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
			 struct dentry *dentry, umode_t mode)
{
	struct btrfs_fs_info *info = btrfs_sb(dir->i_sb);
	struct btrfs_root *dir_root = BTRFS_I(dir)->root;
	struct btrfs_new_inode_args args = {
		.dir = dir,
		.dentry = dentry,
		.orphan = true,
	};
	struct btrfs_trans_handle *handle;
	struct inode *tmp_inode;
	unsigned int nr_items;
	int err;

	tmp_inode = new_inode(dir->i_sb);
	if (!tmp_inode)
		return -ENOMEM;

	inode_init_owner(mnt_userns, tmp_inode, dir, mode);
	tmp_inode->i_fop = &btrfs_file_operations;
	tmp_inode->i_op = &btrfs_file_inode_operations;
	tmp_inode->i_mapping->a_ops = &btrfs_aops;

	args.inode = tmp_inode;
	err = btrfs_new_inode_prepare(&args, &nr_items);
	if (err)
		goto drop_inode;

	handle = btrfs_start_transaction(dir_root, nr_items);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		goto destroy_args;
	}

	err = btrfs_create_new_inode(handle, &args);

	/*
	 * btrfs_create_new_inode() leaves the link count at 0. Raise it to 1
	 * here, because d_tmpfile() would otherwise warn about a zero count
	 * on its way through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(tmp_inode, 1);

	if (err == 0) {
		d_tmpfile(dentry, tmp_inode);
		unlock_new_inode(tmp_inode);
		mark_inode_dirty(tmp_inode);
	}

	btrfs_end_transaction(handle);
	btrfs_btree_balance_dirty(info);
destroy_args:
	btrfs_new_inode_args_destroy(&args);
drop_inode:
	if (err)
		iput(tmp_inode);
	return err;
}
|
|
|
|
|
|
2021-05-31 16:50:49 +08:00
|
|
|
/*
 * Mark the byte range [@start, @end] of @inode as under writeback.
 *
 * Every page index from @start >> PAGE_SHIFT to @end >> PAGE_SHIFT is looked
 * up in the inode's mapping and handed to btrfs_page_set_writeback() together
 * with the full (start, len) range. The range length must fit in a u32.
 */
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const unsigned long last_index = end >> PAGE_SHIFT;
	unsigned long cur;
	u32 len;

	ASSERT(end + 1 - start <= U32_MAX);
	len = end + 1 - start;

	for (cur = start >> PAGE_SHIFT; cur <= last_index; cur++) {
		struct page *page;

		page = find_get_page(inode->vfs_inode.i_mapping, cur);
		ASSERT(page); /* Pages should be in the extent_io_tree */

		btrfs_page_set_writeback(fs_info, page, start, len);
		/* Drop the reference taken by find_get_page(). */
		put_page(page);
	}
}
|
|
|
|
|
|
2022-03-17 10:25:42 -07:00
|
|
|
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
|
|
|
|
|
int compress_type)
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
{
|
|
|
|
|
switch (compress_type) {
|
|
|
|
|
case BTRFS_COMPRESS_NONE:
|
|
|
|
|
return BTRFS_ENCODED_IO_COMPRESSION_NONE;
|
|
|
|
|
case BTRFS_COMPRESS_ZLIB:
|
|
|
|
|
return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
|
|
|
|
|
case BTRFS_COMPRESS_LZO:
|
|
|
|
|
/*
|
|
|
|
|
* The LZO format depends on the sector size. 64K is the maximum
|
|
|
|
|
* sector size that we support.
|
|
|
|
|
*/
|
|
|
|
|
if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
|
|
|
|
|
(fs_info->sectorsize_bits - 12);
|
|
|
|
|
case BTRFS_COMPRESS_ZSTD:
|
|
|
|
|
return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
|
|
|
|
|
default:
|
|
|
|
|
return -EUCLEAN;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static ssize_t btrfs_encoded_read_inline(
|
|
|
|
|
struct kiocb *iocb,
|
|
|
|
|
struct iov_iter *iter, u64 start,
|
|
|
|
|
u64 lockend,
|
|
|
|
|
struct extent_state **cached_state,
|
|
|
|
|
u64 extent_start, size_t count,
|
|
|
|
|
struct btrfs_ioctl_encoded_io_args *encoded,
|
|
|
|
|
bool *unlocked)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
|
struct btrfs_file_extent_item *item;
|
|
|
|
|
u64 ram_bytes;
|
|
|
|
|
unsigned long ptr;
|
|
|
|
|
void *tmp;
|
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
|
if (!path) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
|
|
|
|
|
extent_start, 0);
|
|
|
|
|
if (ret) {
|
|
|
|
|
if (ret > 0) {
|
|
|
|
|
/* The extent item disappeared? */
|
|
|
|
|
ret = -EIO;
|
|
|
|
|
}
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
|
|
ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
|
|
|
|
|
ptr = btrfs_file_extent_inline_start(item);
|
|
|
|
|
|
|
|
|
|
encoded->len = min_t(u64, extent_start + ram_bytes,
|
|
|
|
|
inode->vfs_inode.i_size) - iocb->ki_pos;
|
|
|
|
|
ret = btrfs_encoded_io_compression_from_extent(fs_info,
|
|
|
|
|
btrfs_file_extent_compression(leaf, item));
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
goto out;
|
|
|
|
|
encoded->compression = ret;
|
|
|
|
|
if (encoded->compression) {
|
|
|
|
|
size_t inline_size;
|
|
|
|
|
|
|
|
|
|
inline_size = btrfs_file_extent_inline_item_len(leaf,
|
|
|
|
|
path->slots[0]);
|
|
|
|
|
if (inline_size > count) {
|
|
|
|
|
ret = -ENOBUFS;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
count = inline_size;
|
|
|
|
|
encoded->unencoded_len = ram_bytes;
|
|
|
|
|
encoded->unencoded_offset = iocb->ki_pos - extent_start;
|
|
|
|
|
} else {
|
|
|
|
|
count = min_t(u64, count, encoded->len);
|
|
|
|
|
encoded->len = count;
|
|
|
|
|
encoded->unencoded_len = count;
|
|
|
|
|
ptr += iocb->ki_pos - extent_start;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tmp = kmalloc(count, GFP_NOFS);
|
|
|
|
|
if (!tmp) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
read_extent_buffer(leaf, tmp, ptr, count);
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
unlock_extent_cached(io_tree, start, lockend, cached_state);
|
|
|
|
|
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
|
|
|
|
|
*unlocked = true;
|
|
|
|
|
|
|
|
|
|
ret = copy_to_iter(tmp, count, iter);
|
|
|
|
|
if (ret != count)
|
|
|
|
|
ret = -EFAULT;
|
|
|
|
|
kfree(tmp);
|
|
|
|
|
out:
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct btrfs_encoded_read_private {
|
|
|
|
|
struct btrfs_inode *inode;
|
|
|
|
|
u64 file_offset;
|
|
|
|
|
wait_queue_head_t wait;
|
|
|
|
|
atomic_t pending;
|
|
|
|
|
blk_status_t status;
|
|
|
|
|
bool skip_csum;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
|
|
|
|
|
struct bio *bio, int mirror_num)
|
|
|
|
|
{
|
2022-08-06 10:03:26 +02:00
|
|
|
struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
|
|
|
blk_status_t ret;
|
|
|
|
|
|
|
|
|
|
if (!priv->skip_csum) {
|
|
|
|
|
ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
atomic_inc(&priv->pending);
|
2022-06-17 12:04:07 +02:00
|
|
|
btrfs_submit_bio(fs_info, bio, mirror_num);
|
|
|
|
|
return BLK_STS_OK;
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
|
|
|
|
|
{
|
|
|
|
|
const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
|
2022-08-06 10:03:26 +02:00
|
|
|
struct btrfs_encoded_read_private *priv = bbio->private;
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
struct btrfs_inode *inode = priv->inode;
|
|
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
|
|
|
u32 sectorsize = fs_info->sectorsize;
|
|
|
|
|
struct bio_vec *bvec;
|
|
|
|
|
struct bvec_iter_all iter_all;
|
|
|
|
|
u32 bio_offset = 0;
|
|
|
|
|
|
|
|
|
|
if (priv->skip_csum || !uptodate)
|
|
|
|
|
return bbio->bio.bi_status;
|
|
|
|
|
|
|
|
|
|
bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
|
|
|
|
|
unsigned int i, nr_sectors, pgoff;
|
|
|
|
|
|
|
|
|
|
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
|
|
|
|
|
pgoff = bvec->bv_offset;
|
|
|
|
|
for (i = 0; i < nr_sectors; i++) {
|
|
|
|
|
ASSERT(pgoff < PAGE_SIZE);
|
2022-07-07 07:33:29 +02:00
|
|
|
if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
|
|
|
|
|
bvec->bv_page, pgoff))
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
return BLK_STS_IOERR;
|
|
|
|
|
bio_offset += sectorsize;
|
|
|
|
|
pgoff += sectorsize;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return BLK_STS_OK;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-06 10:03:26 +02:00
|
|
|
static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
{
|
2022-08-06 10:03:26 +02:00
|
|
|
struct btrfs_encoded_read_private *priv = bbio->private;
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
blk_status_t status;
|
|
|
|
|
|
|
|
|
|
status = btrfs_encoded_read_verify_csum(bbio);
|
|
|
|
|
if (status) {
|
|
|
|
|
/*
|
|
|
|
|
* The memory barrier implied by the atomic_dec_return() here
|
|
|
|
|
* pairs with the memory barrier implied by the
|
|
|
|
|
* atomic_dec_return() or io_wait_event() in
|
|
|
|
|
* btrfs_encoded_read_regular_fill_pages() to ensure that this
|
|
|
|
|
* write is observed before the load of status in
|
|
|
|
|
* btrfs_encoded_read_regular_fill_pages().
|
|
|
|
|
*/
|
|
|
|
|
WRITE_ONCE(priv->status, status);
|
|
|
|
|
}
|
|
|
|
|
if (!atomic_dec_return(&priv->pending))
|
|
|
|
|
wake_up(&priv->wait);
|
|
|
|
|
btrfs_bio_free_csum(bbio);
|
2022-08-06 10:03:26 +02:00
|
|
|
bio_put(&bbio->bio);
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
}
|
|
|
|
|
|
2022-03-17 10:25:42 -07:00
|
|
|
/*
 * Read @disk_io_size bytes starting at @disk_bytenr into @pages and wait for
 * all of the I/O to finish.
 *
 * @inode:        inode the read is issued for
 * @file_offset:  stored in the private context handed to each bio
 * @disk_bytenr:  logical disk address of the first byte to read
 * @disk_io_size: number of bytes to read
 * @pages:        destination pages, consumed one per PAGE_SIZE of data
 *
 * Returns 0 on success or a negative errno converted from the first recorded
 * block status.
 */
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
					  u64 file_offset, u64 disk_bytenr,
					  u64 disk_io_size, struct page **pages)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_encoded_read_private priv = {
		.inode = inode,
		.file_offset = file_offset,
		/* Our own count; dropped at "out" once submission is done. */
		.pending = ATOMIC_INIT(1),
		.skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
	};
	unsigned long page_idx = 0;
	u64 done = 0;
	int err;

	init_waitqueue_head(&priv.wait);

	/*
	 * Walk the extent one I/O geometry at a time; within each, pack pages
	 * into bios and submit whenever a bio fills up or the geometry ends.
	 */
	while (done < disk_io_size) {
		struct btrfs_io_geometry geom;
		struct extent_map *chunk_em;
		struct bio *bio = NULL;
		u64 left;

		chunk_em = btrfs_get_chunk_map(fs_info, disk_bytenr + done,
					       disk_io_size - done);
		if (IS_ERR(chunk_em)) {
			err = PTR_ERR(chunk_em);
		} else {
			err = btrfs_get_io_geometry(fs_info, chunk_em,
						    BTRFS_MAP_READ,
						    disk_bytenr + done, &geom);
			free_extent_map(chunk_em);
		}
		if (err) {
			WRITE_ONCE(priv.status, errno_to_blk_status(err));
			break;
		}

		left = min(geom.len, disk_io_size - done);
		/* Keep going while there is data left or an unsubmitted bio. */
		while (bio || left) {
			size_t chunk = min_t(u64, left, PAGE_SIZE);
			blk_status_t status;

			if (!bio) {
				bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
						      btrfs_encoded_read_endio,
						      &priv);
				bio->bi_iter.bi_sector =
					(disk_bytenr + done) >> SECTOR_SHIFT;
			}

			/* Common case: the page fits, account for it. */
			if (chunk &&
			    bio_add_page(bio, pages[page_idx], chunk, 0) >= chunk) {
				page_idx++;
				done += chunk;
				left -= chunk;
				continue;
			}

			/*
			 * Nothing left for this geometry, or the page was not
			 * fully added: send what we have and start a new bio
			 * on the next pass.
			 */
			status = submit_encoded_read_bio(inode, bio, 0);
			if (status) {
				WRITE_ONCE(priv.status, status);
				bio_put(bio);
				goto out;
			}
			bio = NULL;
		}
	}

out:
	/* Drop our own count and wait for any bios still outstanding. */
	if (atomic_dec_return(&priv.pending))
		io_wait_event(priv.wait, !atomic_read(&priv.pending));
	/* See btrfs_encoded_read_endio() for ordering. */
	return blk_status_to_errno(READ_ONCE(priv.status));
}
|
|
|
|
|
|
|
|
|
|
static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
|
|
|
|
|
struct iov_iter *iter,
|
|
|
|
|
u64 start, u64 lockend,
|
|
|
|
|
struct extent_state **cached_state,
|
|
|
|
|
u64 disk_bytenr, u64 disk_io_size,
|
|
|
|
|
size_t count, bool compressed,
|
|
|
|
|
bool *unlocked)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
|
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
|
|
|
|
struct page **pages;
|
|
|
|
|
unsigned long nr_pages, i;
|
|
|
|
|
u64 cur;
|
|
|
|
|
size_t page_offset;
|
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
|
|
nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
|
|
|
|
|
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
|
|
|
|
|
if (!pages)
|
|
|
|
|
return -ENOMEM;
|
2022-03-30 16:11:22 -04:00
|
|
|
ret = btrfs_alloc_page_array(nr_pages, pages);
|
|
|
|
|
if (ret) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
|
|
|
|
|
disk_io_size, pages);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
unlock_extent_cached(io_tree, start, lockend, cached_state);
|
|
|
|
|
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
|
|
|
|
|
*unlocked = true;
|
|
|
|
|
|
|
|
|
|
if (compressed) {
|
|
|
|
|
i = 0;
|
|
|
|
|
page_offset = 0;
|
|
|
|
|
} else {
|
|
|
|
|
i = (iocb->ki_pos - start) >> PAGE_SHIFT;
|
|
|
|
|
page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
|
|
|
|
|
}
|
|
|
|
|
cur = 0;
|
|
|
|
|
while (cur < count) {
|
|
|
|
|
size_t bytes = min_t(size_t, count - cur,
|
|
|
|
|
PAGE_SIZE - page_offset);
|
|
|
|
|
|
|
|
|
|
if (copy_page_to_iter(pages[i], page_offset, bytes,
|
|
|
|
|
iter) != bytes) {
|
|
|
|
|
ret = -EFAULT;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
i++;
|
|
|
|
|
cur += bytes;
|
|
|
|
|
page_offset = 0;
|
|
|
|
|
}
|
|
|
|
|
ret = count;
|
|
|
|
|
out:
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
|
if (pages[i])
|
|
|
|
|
__free_page(pages[i]);
|
|
|
|
|
}
|
|
|
|
|
kfree(pages);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
|
|
|
|
|
struct btrfs_ioctl_encoded_io_args *encoded)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
|
|
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
|
|
|
|
ssize_t ret;
|
|
|
|
|
size_t count = iov_iter_count(iter);
|
|
|
|
|
u64 start, lockend, disk_bytenr, disk_io_size;
|
|
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
|
struct extent_map *em;
|
|
|
|
|
bool unlocked = false;
|
|
|
|
|
|
|
|
|
|
file_accessed(iocb->ki_filp);
|
|
|
|
|
|
|
|
|
|
btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
|
|
|
|
|
|
|
|
|
|
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
|
|
|
|
|
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
|
|
|
|
|
/*
|
|
|
|
|
* We don't know how long the extent containing iocb->ki_pos is, but if
|
|
|
|
|
* it's compressed we know that it won't be longer than this.
|
|
|
|
|
*/
|
|
|
|
|
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
|
|
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
|
|
|
|
|
|
ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
|
|
|
|
|
lockend - start + 1);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_unlock_inode;
|
|
|
|
|
lock_extent_bits(io_tree, start, lockend, &cached_state);
|
|
|
|
|
ordered = btrfs_lookup_ordered_range(inode, start,
|
|
|
|
|
lockend - start + 1);
|
|
|
|
|
if (!ordered)
|
|
|
|
|
break;
|
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
unlock_extent_cached(io_tree, start, lockend, &cached_state);
|
|
|
|
|
cond_resched();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
|
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto out_unlock_extent;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (em->block_start == EXTENT_MAP_INLINE) {
|
|
|
|
|
u64 extent_start = em->start;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* For inline extents we get everything we need out of the
|
|
|
|
|
* extent item.
|
|
|
|
|
*/
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
em = NULL;
|
|
|
|
|
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
|
|
|
|
|
&cached_state, extent_start,
|
|
|
|
|
count, encoded, &unlocked);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We only want to return up to EOF even if the extent extends beyond
|
|
|
|
|
* that.
|
|
|
|
|
*/
|
|
|
|
|
encoded->len = min_t(u64, extent_map_end(em),
|
|
|
|
|
inode->vfs_inode.i_size) - iocb->ki_pos;
|
|
|
|
|
if (em->block_start == EXTENT_MAP_HOLE ||
|
|
|
|
|
test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
|
|
|
|
|
disk_bytenr = EXTENT_MAP_HOLE;
|
|
|
|
|
count = min_t(u64, count, encoded->len);
|
|
|
|
|
encoded->len = count;
|
|
|
|
|
encoded->unencoded_len = count;
|
|
|
|
|
} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
|
|
|
|
|
disk_bytenr = em->block_start;
|
|
|
|
|
/*
|
|
|
|
|
* Bail if the buffer isn't large enough to return the whole
|
|
|
|
|
* compressed extent.
|
|
|
|
|
*/
|
|
|
|
|
if (em->block_len > count) {
|
|
|
|
|
ret = -ENOBUFS;
|
|
|
|
|
goto out_em;
|
|
|
|
|
}
|
2022-06-21 18:40:48 +02:00
|
|
|
disk_io_size = em->block_len;
|
|
|
|
|
count = em->block_len;
|
btrfs: add BTRFS_IOC_ENCODED_READ ioctl
There are 4 main cases:
1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
from disk.
4. Regular, compressed extents: we read the entire compressed extent
from disk and indicate what subset of the decompressed extent is in
the file.
This initial implementation simplifies a few things that can be improved
in the future:
- Cases 1, 3, and 4 allocate temporary memory to read into before
copying out to userspace.
- We don't do read repair, because it turns out that read repair is
currently broken for compressed data.
- We hold the inode lock during the operation.
Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite() and read the old data from before the page was
dirtied:
btrfs_page_mkwrite btrfs_encoded_read
---------------------------------------------------
(enter) (enter)
btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
lock_extent_bits
read extent (dirty page hasn't been flushed,
so this is the old data)
unlock_extent_cached
(exit)
we read the old data from before the page was dirtied. But, that's true
even if we were to hold the mmap lock:
btrfs_page_mkwrite btrfs_encoded_read
-------------------------------------------------------------------
(enter) (enter)
btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
btrfs_wait_ordered_range
lock_extent_bits
read extent (page hasn't been dirtied,
so this is the old data)
unlock_extent_cached
btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-09 17:59:07 -07:00
|
|
|
encoded->unencoded_len = em->ram_bytes;
|
|
|
|
|
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
|
|
|
|
|
ret = btrfs_encoded_io_compression_from_extent(fs_info,
|
|
|
|
|
em->compress_type);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
goto out_em;
|
|
|
|
|
encoded->compression = ret;
|
|
|
|
|
} else {
|
|
|
|
|
disk_bytenr = em->block_start + (start - em->start);
|
|
|
|
|
if (encoded->len > count)
|
|
|
|
|
encoded->len = count;
|
|
|
|
|
/*
|
|
|
|
|
* Don't read beyond what we locked. This also limits the page
|
|
|
|
|
* allocations that we'll do.
|
|
|
|
|
*/
|
|
|
|
|
disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
|
|
|
|
|
count = start + disk_io_size - iocb->ki_pos;
|
|
|
|
|
encoded->len = count;
|
|
|
|
|
encoded->unencoded_len = count;
|
|
|
|
|
disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
|
|
|
|
|
}
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
em = NULL;
|
|
|
|
|
|
|
|
|
|
if (disk_bytenr == EXTENT_MAP_HOLE) {
|
|
|
|
|
unlock_extent_cached(io_tree, start, lockend, &cached_state);
|
|
|
|
|
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
|
|
|
|
|
unlocked = true;
|
|
|
|
|
ret = iov_iter_zero(count, iter);
|
|
|
|
|
if (ret != count)
|
|
|
|
|
ret = -EFAULT;
|
|
|
|
|
} else {
|
|
|
|
|
ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
|
|
|
|
|
&cached_state, disk_bytenr,
|
|
|
|
|
disk_io_size, count,
|
|
|
|
|
encoded->compression,
|
|
|
|
|
&unlocked);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out:
|
|
|
|
|
if (ret >= 0)
|
|
|
|
|
iocb->ki_pos += encoded->len;
|
|
|
|
|
out_em:
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
out_unlock_extent:
|
|
|
|
|
if (!unlocked)
|
|
|
|
|
unlock_extent_cached(io_tree, start, lockend, &cached_state);
|
|
|
|
|
out_unlock_inode:
|
|
|
|
|
if (!unlocked)
|
|
|
|
|
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-13 16:00:02 -07:00
|
|
|
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
|
|
|
|
|
const struct btrfs_ioctl_encoded_io_args *encoded)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
|
|
|
|
struct extent_changeset *data_reserved = NULL;
|
|
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
|
int compression;
|
|
|
|
|
size_t orig_count;
|
|
|
|
|
u64 start, end;
|
|
|
|
|
u64 num_bytes, ram_bytes, disk_num_bytes;
|
|
|
|
|
unsigned long nr_pages, i;
|
|
|
|
|
struct page **pages;
|
|
|
|
|
struct btrfs_key ins;
|
|
|
|
|
bool extent_reserved = false;
|
|
|
|
|
struct extent_map *em;
|
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
|
|
switch (encoded->compression) {
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
|
|
|
|
|
compression = BTRFS_COMPRESS_ZLIB;
|
|
|
|
|
break;
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
|
|
|
|
|
compression = BTRFS_COMPRESS_ZSTD;
|
|
|
|
|
break;
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
|
|
|
|
|
case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
|
|
|
|
|
/* The sector size must match for LZO. */
|
|
|
|
|
if (encoded->compression -
|
|
|
|
|
BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
|
|
|
|
|
fs_info->sectorsize_bits)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
compression = BTRFS_COMPRESS_LZO;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
orig_count = iov_iter_count(from);
|
|
|
|
|
|
|
|
|
|
/* The extent size must be sane. */
|
|
|
|
|
if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
|
|
|
|
|
orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The compressed data must be smaller than the decompressed data.
|
|
|
|
|
*
|
|
|
|
|
* It's of course possible for data to compress to larger or the same
|
|
|
|
|
* size, but the buffered I/O path falls back to no compression for such
|
|
|
|
|
* data, and we don't want to break any assumptions by creating these
|
|
|
|
|
* extents.
|
|
|
|
|
*
|
|
|
|
|
* Note that this is less strict than the current check we have that the
|
|
|
|
|
* compressed data must be at least one sector smaller than the
|
|
|
|
|
* decompressed data. We only want to enforce the weaker requirement
|
|
|
|
|
* from old kernels that it is at least one byte smaller.
|
|
|
|
|
*/
|
|
|
|
|
if (orig_count >= encoded->unencoded_len)
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/* The extent must start on a sector boundary. */
|
|
|
|
|
start = iocb->ki_pos;
|
|
|
|
|
if (!IS_ALIGNED(start, fs_info->sectorsize))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The extent must end on a sector boundary. However, we allow a write
|
|
|
|
|
* which ends at or extends i_size to have an unaligned length; we round
|
|
|
|
|
* up the extent size and set i_size to the unaligned end.
|
|
|
|
|
*/
|
|
|
|
|
if (start + encoded->len < inode->vfs_inode.i_size &&
|
|
|
|
|
!IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
/* Finally, the offset in the unencoded data must be sector-aligned. */
|
|
|
|
|
if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
|
|
num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
|
|
|
|
|
ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
|
|
|
|
|
end = start + num_bytes - 1;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If the extent cannot be inline, the compressed data on disk must be
|
|
|
|
|
* sector-aligned. For convenience, we extend it with zeroes if it
|
|
|
|
|
* isn't.
|
|
|
|
|
*/
|
|
|
|
|
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
|
|
|
|
|
nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
|
|
|
|
|
pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
|
|
|
|
|
if (!pages)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
|
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
|
|
|
|
|
char *kaddr;
|
|
|
|
|
|
|
|
|
|
pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
|
|
|
|
|
if (!pages[i]) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out_pages;
|
|
|
|
|
}
|
2022-05-31 16:53:33 +02:00
|
|
|
kaddr = kmap_local_page(pages[i]);
|
2019-08-13 16:00:02 -07:00
|
|
|
if (copy_from_iter(kaddr, bytes, from) != bytes) {
|
2022-05-31 16:53:33 +02:00
|
|
|
kunmap_local(kaddr);
|
2019-08-13 16:00:02 -07:00
|
|
|
ret = -EFAULT;
|
|
|
|
|
goto out_pages;
|
|
|
|
|
}
|
|
|
|
|
if (bytes < PAGE_SIZE)
|
|
|
|
|
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
|
2022-05-31 16:53:33 +02:00
|
|
|
kunmap_local(kaddr);
|
2019-08-13 16:00:02 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
|
|
|
|
|
|
ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_pages;
|
|
|
|
|
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
|
|
|
|
|
start >> PAGE_SHIFT,
|
|
|
|
|
end >> PAGE_SHIFT);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_pages;
|
|
|
|
|
lock_extent_bits(io_tree, start, end, &cached_state);
|
|
|
|
|
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
|
|
|
|
|
if (!ordered &&
|
|
|
|
|
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
|
|
|
|
|
break;
|
|
|
|
|
if (ordered)
|
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
unlock_extent_cached(io_tree, start, end, &cached_state);
|
|
|
|
|
cond_resched();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We don't use the higher-level delalloc space functions because our
|
|
|
|
|
* num_bytes and disk_num_bytes are different.
|
|
|
|
|
*/
|
|
|
|
|
ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_unlock;
|
|
|
|
|
ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_free_data_space;
|
btrfs: avoid blocking on space revervation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.
So make the NOWAIT write path to try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.
This is part of a patchset comprised of the following patches:
btrfs: avoid blocking on page locks with nowait dio on compressed range
btrfs: avoid blocking nowait dio when locking file range
btrfs: avoid double nocow check when doing nowait dio writes
btrfs: stop allocating a path when checking if cross reference exists
btrfs: free path at can_nocow_extent() before checking for checksum items
btrfs: release path earlier at can_nocow_extent()
btrfs: avoid blocking when allocating context for nowait dio read/write
btrfs: avoid blocking on space revervation when doing nowait dio writes
The following test was run before and after applying this patchset:
$ cat io-uring-nodatacow-test.sh
#!/bin/bash
DEV=/dev/sdc
MNT=/mnt/sdc
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
NUM_JOBS=4
FILE_SIZE=8G
RUN_TIME=300
cat <<EOF > /tmp/fio-job.ini
[io_uring_rw]
rw=randrw
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
filesize=$FILE_SIZE
runtime=$RUN_TIME
time_based
filename=foobar
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The test was run a 12 cores box with 64G of ram, using a non-debug kernel
config (Debian's default config) and a spinning disk.
Result before the patchset:
READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
Result after the patchset:
READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec
That's about +7.2% throughput for reads and +6.9% for writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-03-23 16:19:30 +00:00
|
|
|
ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
|
|
|
|
|
false);
|
2019-08-13 16:00:02 -07:00
|
|
|
if (ret)
|
|
|
|
|
goto out_qgroup_free_data;
|
|
|
|
|
|
|
|
|
|
/* Try an inline extent first. */
|
|
|
|
|
if (start == 0 && encoded->unencoded_len == encoded->len &&
|
|
|
|
|
encoded->unencoded_offset == 0) {
|
|
|
|
|
ret = cow_file_range_inline(inode, encoded->len, orig_count,
|
|
|
|
|
compression, pages, true);
|
|
|
|
|
if (ret <= 0) {
|
|
|
|
|
if (ret == 0)
|
|
|
|
|
ret = orig_count;
|
|
|
|
|
goto out_delalloc_release;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
|
|
|
|
|
disk_num_bytes, 0, 0, &ins, 1, 1);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_delalloc_release;
|
|
|
|
|
extent_reserved = true;
|
|
|
|
|
|
|
|
|
|
em = create_io_em(inode, start, num_bytes,
|
|
|
|
|
start - encoded->unencoded_offset, ins.objectid,
|
|
|
|
|
ins.offset, ins.offset, ram_bytes, compression,
|
|
|
|
|
BTRFS_ORDERED_COMPRESSED);
|
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto out_free_reserved;
|
|
|
|
|
}
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
|
|
ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
|
|
|
|
|
ins.objectid, ins.offset,
|
|
|
|
|
encoded->unencoded_offset,
|
|
|
|
|
(1 << BTRFS_ORDERED_ENCODED) |
|
|
|
|
|
(1 << BTRFS_ORDERED_COMPRESSED),
|
|
|
|
|
compression);
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_drop_extent_cache(inode, start, end, 0);
|
|
|
|
|
goto out_free_reserved;
|
|
|
|
|
}
|
|
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
|
|
|
|
|
|
|
|
|
if (start + encoded->len > inode->vfs_inode.i_size)
|
|
|
|
|
i_size_write(&inode->vfs_inode, start + encoded->len);
|
|
|
|
|
|
|
|
|
|
unlock_extent_cached(io_tree, start, end, &cached_state);
|
|
|
|
|
|
|
|
|
|
btrfs_delalloc_release_extents(inode, num_bytes);
|
|
|
|
|
|
|
|
|
|
if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
|
|
|
|
|
ins.offset, pages, nr_pages, 0, NULL,
|
|
|
|
|
false)) {
|
|
|
|
|
btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
|
|
|
|
|
ret = -EIO;
|
|
|
|
|
goto out_pages;
|
|
|
|
|
}
|
|
|
|
|
ret = orig_count;
|
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
|
|
out_free_reserved:
|
|
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
|
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
|
|
|
|
|
out_delalloc_release:
|
|
|
|
|
btrfs_delalloc_release_extents(inode, num_bytes);
|
|
|
|
|
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
|
|
|
|
|
out_qgroup_free_data:
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
|
|
|
|
|
out_free_data_space:
|
|
|
|
|
/*
|
|
|
|
|
* If btrfs_reserve_extent() succeeded, then we already decremented
|
|
|
|
|
* bytes_may_use.
|
|
|
|
|
*/
|
|
|
|
|
if (!extent_reserved)
|
|
|
|
|
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
|
|
|
|
|
out_unlock:
|
|
|
|
|
unlock_extent_cached(io_tree, start, end, &cached_state);
|
|
|
|
|
out_pages:
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
|
if (pages[i])
|
|
|
|
|
__free_page(pages[i]);
|
|
|
|
|
}
|
|
|
|
|
kvfree(pages);
|
|
|
|
|
out:
|
|
|
|
|
if (ret >= 0)
|
|
|
|
|
iocb->ki_pos += encoded->len;
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2016-11-03 10:28:14 -07:00
|
|
|
#ifdef CONFIG_SWAP
|
|
|
|
|
/*
|
|
|
|
|
* Add an entry indicating a block group or device which is pinned by a
|
|
|
|
|
* swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
|
|
|
|
|
* negative errno on failure.
|
|
|
|
|
*/
|
|
|
|
|
static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
|
|
|
|
|
bool is_block_group)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
|
|
|
|
struct btrfs_swapfile_pin *sp, *entry;
|
|
|
|
|
struct rb_node **p;
|
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
|
|
|
|
|
|
sp = kmalloc(sizeof(*sp), GFP_NOFS);
|
|
|
|
|
if (!sp)
|
|
|
|
|
return -ENOMEM;
|
|
|
|
|
sp->ptr = ptr;
|
|
|
|
|
sp->inode = inode;
|
|
|
|
|
sp->is_block_group = is_block_group;
|
btrfs: fix race between writes to swap files and scrub
When we active a swap file, at btrfs_swap_activate(), we acquire the
exclusive operation lock to prevent the physical location of the swap
file extents to be changed by operations such as balance and device
replace/resize/remove. We also call there can_nocow_extent() which,
among other things, checks if the block group of a swap file extent is
currently RO, and if it is we can not use the extent, since a write
into it would result in COWing the extent.
However we have no protection against a scrub operation running after we
activate the swap file, which can result in the swap file extents to be
COWed while the scrub is running and operating on the respective block
group, because scrub turns a block group into RO before it processes it
and then back again to RW mode after processing it. That means an attempt
to write into a swap file extent while scrub is processing the respective
block group, will result in COWing the extent, changing its physical
location on disk.
Fix this by making sure that block groups that have extents that are used
by active swap files can not be turned into RO mode, therefore making it
not possible for a scrub to turn them into RO mode. When a scrub finds a
block group that can not be turned to RO due to the existence of extents
used by swap files, it proceeds to the next block group and logs a warning
message that mentions the block group was skipped due to active swap
files - this is the same approach we currently use for balance.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:37 +00:00
|
|
|
sp->bg_extent_count = 1;
|
2016-11-03 10:28:14 -07:00
|
|
|
|
|
|
|
|
spin_lock(&fs_info->swapfile_pins_lock);
|
|
|
|
|
p = &fs_info->swapfile_pins.rb_node;
|
|
|
|
|
while (*p) {
|
|
|
|
|
parent = *p;
|
|
|
|
|
entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
|
|
|
|
|
if (sp->ptr < entry->ptr ||
|
|
|
|
|
(sp->ptr == entry->ptr && sp->inode < entry->inode)) {
|
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
|
} else if (sp->ptr > entry->ptr ||
|
|
|
|
|
(sp->ptr == entry->ptr && sp->inode > entry->inode)) {
|
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
|
} else {
|
btrfs: fix race between writes to swap files and scrub
When we active a swap file, at btrfs_swap_activate(), we acquire the
exclusive operation lock to prevent the physical location of the swap
file extents to be changed by operations such as balance and device
replace/resize/remove. We also call there can_nocow_extent() which,
among other things, checks if the block group of a swap file extent is
currently RO, and if it is we can not use the extent, since a write
into it would result in COWing the extent.
However we have no protection against a scrub operation running after we
activate the swap file, which can result in the swap file extents to be
COWed while the scrub is running and operating on the respective block
group, because scrub turns a block group into RO before it processes it
and then back again to RW mode after processing it. That means an attempt
to write into a swap file extent while scrub is processing the respective
block group, will result in COWing the extent, changing its physical
location on disk.
Fix this by making sure that block groups that have extents that are used
by active swap files can not be turned into RO mode, therefore making it
not possible for a scrub to turn them into RO mode. When a scrub finds a
block group that can not be turned to RO due to the existence of extents
used by swap files, it proceeds to the next block group and logs a warning
message that mentions the block group was skipped due to active swap
files - this is the same approach we currently use for balance.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:37 +00:00
|
|
|
if (is_block_group)
|
|
|
|
|
entry->bg_extent_count++;
|
2016-11-03 10:28:14 -07:00
|
|
|
spin_unlock(&fs_info->swapfile_pins_lock);
|
|
|
|
|
kfree(sp);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
rb_link_node(&sp->node, parent, p);
|
|
|
|
|
rb_insert_color(&sp->node, &fs_info->swapfile_pins);
|
|
|
|
|
spin_unlock(&fs_info->swapfile_pins_lock);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Free all of the entries pinned by this swapfile. */
|
|
|
|
|
static void btrfs_free_swapfile_pins(struct inode *inode)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
|
|
|
|
struct btrfs_swapfile_pin *sp;
|
|
|
|
|
struct rb_node *node, *next;
|
|
|
|
|
|
|
|
|
|
spin_lock(&fs_info->swapfile_pins_lock);
|
|
|
|
|
node = rb_first(&fs_info->swapfile_pins);
|
|
|
|
|
while (node) {
|
|
|
|
|
next = rb_next(node);
|
|
|
|
|
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
|
|
|
|
|
if (sp->inode == inode) {
|
|
|
|
|
rb_erase(&sp->node, &fs_info->swapfile_pins);
|
btrfs: fix race between writes to swap files and scrub
When we active a swap file, at btrfs_swap_activate(), we acquire the
exclusive operation lock to prevent the physical location of the swap
file extents to be changed by operations such as balance and device
replace/resize/remove. We also call there can_nocow_extent() which,
among other things, checks if the block group of a swap file extent is
currently RO, and if it is we can not use the extent, since a write
into it would result in COWing the extent.
However we have no protection against a scrub operation running after we
activate the swap file, which can result in the swap file extents to be
COWed while the scrub is running and operating on the respective block
group, because scrub turns a block group into RO before it processes it
and then back again to RW mode after processing it. That means an attempt
to write into a swap file extent while scrub is processing the respective
block group, will result in COWing the extent, changing its physical
location on disk.
Fix this by making sure that block groups that have extents that are used
by active swap files can not be turned into RO mode, therefore making it
not possible for a scrub to turn them into RO mode. When a scrub finds a
block group that can not be turned to RO due to the existence of extents
used by swap files, it proceeds to the next block group and logs a warning
message that mentions the block group was skipped due to active swap
files - this is the same approach we currently use for balance.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:37 +00:00
|
|
|
if (sp->is_block_group) {
|
|
|
|
|
btrfs_dec_block_group_swap_extents(sp->ptr,
|
|
|
|
|
sp->bg_extent_count);
|
2016-11-03 10:28:14 -07:00
|
|
|
btrfs_put_block_group(sp->ptr);
|
btrfs: fix race between writes to swap files and scrub
When we active a swap file, at btrfs_swap_activate(), we acquire the
exclusive operation lock to prevent the physical location of the swap
file extents to be changed by operations such as balance and device
replace/resize/remove. We also call there can_nocow_extent() which,
among other things, checks if the block group of a swap file extent is
currently RO, and if it is we can not use the extent, since a write
into it would result in COWing the extent.
However we have no protection against a scrub operation running after we
activate the swap file, which can result in the swap file extents to be
COWed while the scrub is running and operating on the respective block
group, because scrub turns a block group into RO before it processes it
and then back again to RW mode after processing it. That means an attempt
to write into a swap file extent while scrub is processing the respective
block group, will result in COWing the extent, changing its physical
location on disk.
Fix this by making sure that block groups that have extents that are used
by active swap files can not be turned into RO mode, therefore making it
not possible for a scrub to turn them into RO mode. When a scrub finds a
block group that can not be turned to RO due to the existence of extents
used by swap files, it proceeds to the next block group and logs a warning
message that mentions the block group was skipped due to active swap
files - this is the same approach we currently use for balance.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:37 +00:00
|
|
|
}
|
2016-11-03 10:28:14 -07:00
|
|
|
kfree(sp);
|
|
|
|
|
}
|
|
|
|
|
node = next;
|
|
|
|
|
}
|
|
|
|
|
spin_unlock(&fs_info->swapfile_pins_lock);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Accumulator passed to btrfs_add_swap_extent() while the extents of a
 * swapfile are being handed to the swap subsystem.
 */
struct btrfs_swap_info {
	/*
	 * When 0, btrfs_add_swap_extent() leaves the first page out of the
	 * lowest_ppage tracking.
	 * NOTE(review): presumably the file offset of the current extent -
	 * confirm against btrfs_swap_activate().
	 */
	u64 start;
	/* Start of the current extent, in bytes (page-aligned before use). */
	u64 block_start;
	/* Length of the current extent, in bytes. */
	u64 block_len;
	/* Lowest page number reported so far. */
	u64 lowest_ppage;
	/* Highest page number reported so far. */
	u64 highest_ppage;
	/* Pages added to the swap map so far; capped by sis->max. */
	unsigned long nr_pages;
	/* Sum of the non-negative add_swap_extent() return values. */
	int nr_extents;
};
|
|
|
|
|
|
|
|
|
|
static int btrfs_add_swap_extent(struct swap_info_struct *sis,
|
|
|
|
|
struct btrfs_swap_info *bsi)
|
|
|
|
|
{
|
|
|
|
|
unsigned long nr_pages;
|
2021-12-16 15:00:32 +00:00
|
|
|
unsigned long max_pages;
|
2016-11-03 10:28:14 -07:00
|
|
|
u64 first_ppage, first_ppage_reported, next_ppage;
|
|
|
|
|
int ret;
|
|
|
|
|
|
2021-12-16 15:00:32 +00:00
|
|
|
/*
|
|
|
|
|
* Our swapfile may have had its size extended after the swap header was
|
|
|
|
|
* written. In that case activating the swapfile should not go beyond
|
|
|
|
|
* the max size set in the swap header.
|
|
|
|
|
*/
|
|
|
|
|
if (bsi->nr_pages >= sis->max)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
max_pages = sis->max - bsi->nr_pages;
|
2016-11-03 10:28:14 -07:00
|
|
|
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
|
|
|
|
|
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
|
|
|
|
|
PAGE_SIZE) >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
|
|
if (first_ppage >= next_ppage)
|
|
|
|
|
return 0;
|
|
|
|
|
nr_pages = next_ppage - first_ppage;
|
2021-12-16 15:00:32 +00:00
|
|
|
nr_pages = min(nr_pages, max_pages);
|
2016-11-03 10:28:14 -07:00
|
|
|
|
|
|
|
|
first_ppage_reported = first_ppage;
|
|
|
|
|
if (bsi->start == 0)
|
|
|
|
|
first_ppage_reported++;
|
|
|
|
|
if (bsi->lowest_ppage > first_ppage_reported)
|
|
|
|
|
bsi->lowest_ppage = first_ppage_reported;
|
|
|
|
|
if (bsi->highest_ppage < (next_ppage - 1))
|
|
|
|
|
bsi->highest_ppage = next_ppage - 1;
|
|
|
|
|
|
|
|
|
|
ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
return ret;
|
|
|
|
|
bsi->nr_extents += ret;
|
|
|
|
|
bsi->nr_pages += nr_pages;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void btrfs_swap_deactivate(struct file *file)
|
|
|
|
|
{
|
|
|
|
|
struct inode *inode = file_inode(file);
|
|
|
|
|
|
|
|
|
|
btrfs_free_swapfile_pins(inode);
|
|
|
|
|
atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
|
|
|
|
sector_t *span)
|
|
|
|
|
{
|
|
|
|
|
struct inode *inode = file_inode(file);
|
btrfs: fix race between swap file activation and snapshot creation
When creating a snapshot we check if the current number of swap files, in
the root, is non-zero, and if it is, we error out and warn that we can not
create the snapshot because there are active swap files.
However this is racy because when a task started activation of a swap
file, another task might have started already snapshot creation and might
have seen the counter for the number of swap files as zero. This means
that after the swap file is activated we may end up with a snapshot of the
same root successfully created, and therefore when the first write to the
swap file happens it has to fall back into COW mode, which should never
happen for active swap files.
Basically what can happen is:
1) Task A starts snapshot creation and enters ioctl.c:create_snapshot().
There it sees that root->nr_swapfiles has a value of 0 so it continues;
2) Task B enters btrfs_swap_activate(). It is not aware that another task
started snapshot creation but it did not finish yet. It increments
root->nr_swapfiles from 0 to 1;
3) Task B checks that the file meets all requirements to be an active
swap file - it has NOCOW set, there are no snapshots for the inode's
root at the moment, no file holes, no reflinked extents, etc;
4) Task B returns success and now the file is an active swap file;
5) Task A commits the transaction to create the snapshot and finishes.
The swap file's extents are now shared between the original root and
the snapshot;
6) A write into an extent of the swap file is attempted - there is a
snapshot of the file's root, so we fall back to COW mode and therefore
the physical location of the extent changes on disk.
So fix this by taking the snapshot lock during swap file activation before
locking the extent range, as that is the order in which we lock these
during buffered writes.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:38 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2016-11-03 10:28:14 -07:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
|
struct extent_map *em = NULL;
|
|
|
|
|
struct btrfs_device *device = NULL;
|
|
|
|
|
struct btrfs_swap_info bsi = {
|
|
|
|
|
.lowest_ppage = (sector_t)-1ULL,
|
|
|
|
|
};
|
|
|
|
|
int ret = 0;
|
|
|
|
|
u64 isize;
|
|
|
|
|
u64 start;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If the swap file was just created, make sure delalloc is done. If the
|
|
|
|
|
* file changes again after this, the user is doing something stupid and
|
|
|
|
|
* we don't really care.
|
|
|
|
|
*/
|
|
|
|
|
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The inode is locked, so these flags won't change after we check them.
|
|
|
|
|
*/
|
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must not be compressed");
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must not be checksummed");
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Balance or device remove/replace/resize can move stuff around from
|
2020-08-25 10:02:32 -05:00
|
|
|
* under us. The exclop protection makes sure they aren't running/won't
|
|
|
|
|
* run concurrently while we are mapping the swap extents, and
|
|
|
|
|
* fs_info->swapfile_pins prevents them from running while the swap
|
|
|
|
|
* file is active and moving the extents. Note that this also prevents
|
|
|
|
|
* a concurrent device add which isn't actually necessary, but it's not
|
2016-11-03 10:28:14 -07:00
|
|
|
* really worth the trouble to allow it.
|
|
|
|
|
*/
|
2020-08-25 10:02:32 -05:00
|
|
|
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
|
2016-11-03 10:28:14 -07:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"cannot activate swapfile while exclusive operation is running");
|
|
|
|
|
return -EBUSY;
|
|
|
|
|
}
|
btrfs: fix race between swap file activation and snapshot creation
When creating a snapshot we check if the current number of swap files, in
the root, is non-zero, and if it is, we error out and warn that we can not
create the snapshot because there are active swap files.
However this is racy because when a task started activation of a swap
file, another task might have started already snapshot creation and might
have seen the counter for the number of swap files as zero. This means
that after the swap file is activated we may end up with a snapshot of the
same root successfully created, and therefore when the first write to the
swap file happens it has to fall back into COW mode, which should never
happen for active swap files.
Basically what can happen is:
1) Task A starts snapshot creation and enters ioctl.c:create_snapshot().
There it sees that root->nr_swapfiles has a value of 0 so it continues;
2) Task B enters btrfs_swap_activate(). It is not aware that another task
started snapshot creation but it did not finish yet. It increments
root->nr_swapfiles from 0 to 1;
3) Task B checks that the file meets all requirements to be an active
swap file - it has NOCOW set, there are no snapshots for the inode's
root at the moment, no file holes, no reflinked extents, etc;
4) Task B returns success and now the file is an active swap file;
5) Task A commits the transaction to create the snapshot and finishes.
The swap file's extents are now shared between the original root and
the snapshot;
6) A write into an extent of the swap file is attempted - there is a
snapshot of the file's root, so we fall back to COW mode and therefore
the physical location of the extent changes on disk.
So fix this by taking the snapshot lock during swap file activation before
locking the extent range, as that is the order in which we lock these
during buffered writes.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:38 +00:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Prevent snapshot creation while we are activating the swap file.
|
|
|
|
|
* We do not want to race with snapshot creation. If snapshot creation
|
|
|
|
|
* already started before we bumped nr_swapfiles from 0 to 1 and
|
|
|
|
|
* completes before the first write into the swap file after it is
|
|
|
|
|
* activated, then that write would fall back to COW.
|
|
|
|
|
*/
|
|
|
|
|
if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
|
|
|
|
|
btrfs_exclop_finish(fs_info);
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"cannot activate swapfile because snapshot creation is in progress");
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
2016-11-03 10:28:14 -07:00
|
|
|
/*
|
|
|
|
|
* Snapshots can create extents which require COW even if NODATACOW is
|
|
|
|
|
* set. We use this counter to prevent snapshots. We must increment it
|
|
|
|
|
* before walking the extents because we don't want a concurrent
|
|
|
|
|
* snapshot to run after we've already checked the extents.
|
2022-03-23 15:10:32 +08:00
|
|
|
*
|
|
|
|
|
* It is possible that subvolume is marked for deletion but still not
|
|
|
|
|
* removed yet. To prevent this race, we check the root status before
|
|
|
|
|
* activating the swapfile.
|
2016-11-03 10:28:14 -07:00
|
|
|
*/
|
2022-03-23 15:10:32 +08:00
|
|
|
spin_lock(&root->root_item_lock);
|
|
|
|
|
if (btrfs_root_dead(root)) {
|
|
|
|
|
spin_unlock(&root->root_item_lock);
|
|
|
|
|
|
|
|
|
|
btrfs_exclop_finish(fs_info);
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"cannot activate swapfile because subvolume %llu is being deleted",
|
|
|
|
|
root->root_key.objectid);
|
|
|
|
|
return -EPERM;
|
|
|
|
|
}
|
btrfs: fix race between swap file activation and snapshot creation
When creating a snapshot we check if the current number of swap files, in
the root, is non-zero, and if it is, we error out and warn that we can not
create the snapshot because there are active swap files.
However this is racy because when a task started activation of a swap
file, another task might have started already snapshot creation and might
have seen the counter for the number of swap files as zero. This means
that after the swap file is activated we may end up with a snapshot of the
same root successfully created, and therefore when the first write to the
swap file happens it has to fall back into COW mode, which should never
happen for active swap files.
Basically what can happen is:
1) Task A starts snapshot creation and enters ioctl.c:create_snapshot().
There it sees that root->nr_swapfiles has a value of 0 so it continues;
2) Task B enters btrfs_swap_activate(). It is not aware that another task
started snapshot creation but it did not finish yet. It increments
root->nr_swapfiles from 0 to 1;
3) Task B checks that the file meets all requirements to be an active
swap file - it has NOCOW set, there are no snapshots for the inode's
root at the moment, no file holes, no reflinked extents, etc;
4) Task B returns success and now the file is an active swap file;
5) Task A commits the transaction to create the snapshot and finishes.
The swap file's extents are now shared between the original root and
the snapshot;
6) A write into an extent of the swap file is attempted - there is a
snapshot of the file's root, so we fall back to COW mode and therefore
the physical location of the extent changes on disk.
So fix this by taking the snapshot lock during swap file activation before
locking the extent range, as that is the order in which we lock these
during buffered writes.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:38 +00:00
|
|
|
atomic_inc(&root->nr_swapfiles);
|
2022-03-23 15:10:32 +08:00
|
|
|
spin_unlock(&root->root_item_lock);
|
2016-11-03 10:28:14 -07:00
|
|
|
|
|
|
|
|
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
|
|
|
|
|
|
|
|
|
|
lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
|
|
|
|
|
start = 0;
|
|
|
|
|
while (start < isize) {
|
|
|
|
|
u64 logical_block_start, physical_block_start;
|
2019-10-29 19:20:18 +01:00
|
|
|
struct btrfs_block_group *bg;
|
2016-11-03 10:28:14 -07:00
|
|
|
u64 len = isize - start;
|
|
|
|
|
|
2019-12-02 17:34:23 -08:00
|
|
|
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
|
2016-11-03 10:28:14 -07:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (em->block_start == EXTENT_MAP_HOLE) {
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must not have holes");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
if (em->block_start == EXTENT_MAP_INLINE) {
|
|
|
|
|
/*
|
|
|
|
|
* It's unlikely we'll ever actually find ourselves
|
|
|
|
|
* here, as a file small enough to fit inline won't be
|
|
|
|
|
* big enough to store more than the swap header, but in
|
|
|
|
|
* case something changes in the future, let's catch it
|
|
|
|
|
* here rather than later.
|
|
|
|
|
*/
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must not be inline");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must not be compressed");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
logical_block_start = em->block_start + (start - em->start);
|
|
|
|
|
len = min(len, em->len - (start - em->start));
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
em = NULL;
|
|
|
|
|
|
2020-08-18 11:00:05 -07:00
|
|
|
ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
|
2016-11-03 10:28:14 -07:00
|
|
|
if (ret < 0) {
|
|
|
|
|
goto out;
|
|
|
|
|
} else if (ret) {
|
|
|
|
|
ret = 0;
|
|
|
|
|
} else {
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"swapfile must not be copy-on-write");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
|
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"swapfile must have single data profile");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (device == NULL) {
|
|
|
|
|
device = em->map_lookup->stripes[0].dev;
|
|
|
|
|
ret = btrfs_add_swapfile_pin(inode, device, false);
|
|
|
|
|
if (ret == 1)
|
|
|
|
|
ret = 0;
|
|
|
|
|
else if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
} else if (device != em->map_lookup->stripes[0].dev) {
|
|
|
|
|
btrfs_warn(fs_info, "swapfile must be on one device");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
physical_block_start = (em->map_lookup->stripes[0].physical +
|
|
|
|
|
(logical_block_start - em->start));
|
|
|
|
|
len = min(len, em->len - (logical_block_start - em->start));
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
em = NULL;
|
|
|
|
|
|
|
|
|
|
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
|
|
|
|
|
if (!bg) {
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"could not find block group containing swapfile");
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
btrfs: fix race between writes to swap files and scrub
When we activate a swap file, at btrfs_swap_activate(), we acquire the
exclusive operation lock to prevent the physical location of the swap
file extents from being changed by operations such as balance and device
replace/resize/remove. We also call there can_nocow_extent() which,
among other things, checks if the block group of a swap file extent is
currently RO, and if it is we can not use the extent, since a write
into it would result in COWing the extent.
However we have no protection against a scrub operation running after we
activate the swap file, which can result in the swap file extents being
COWed while the scrub is running and operating on the respective block
group, because scrub turns a block group into RO before it processes it
and then back again to RW mode after processing it. That means an attempt
to write into a swap file extent while scrub is processing the respective
block group, will result in COWing the extent, changing its physical
location on disk.
Fix this by making sure that block groups that have extents that are used
by active swap files can not be turned into RO mode, therefore making it
not possible for a scrub to turn them into RO mode. When a scrub finds a
block group that can not be turned to RO due to the existence of extents
used by swap files, it proceeds to the next block group and logs a warning
message that mentions the block group was skipped due to active swap
files - this is the same approach we currently use for balance.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:37 +00:00
|
|
|
if (!btrfs_inc_block_group_swap_extents(bg)) {
|
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
|
"block group for swapfile at %llu is read-only%s",
|
|
|
|
|
bg->start,
|
|
|
|
|
atomic_read(&fs_info->scrubs_running) ?
|
|
|
|
|
" (scrub running)" : "");
|
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2016-11-03 10:28:14 -07:00
|
|
|
ret = btrfs_add_swapfile_pin(inode, bg, true);
|
|
|
|
|
if (ret) {
|
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
|
if (ret == 1)
|
|
|
|
|
ret = 0;
|
|
|
|
|
else
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (bsi.block_len &&
|
|
|
|
|
bsi.block_start + bsi.block_len == physical_block_start) {
|
|
|
|
|
bsi.block_len += len;
|
|
|
|
|
} else {
|
|
|
|
|
if (bsi.block_len) {
|
|
|
|
|
ret = btrfs_add_swap_extent(sis, &bsi);
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
bsi.start = start;
|
|
|
|
|
bsi.block_start = physical_block_start;
|
|
|
|
|
bsi.block_len = len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
start += len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (bsi.block_len)
|
|
|
|
|
ret = btrfs_add_swap_extent(sis, &bsi);
|
|
|
|
|
|
|
|
|
|
out:
|
|
|
|
|
if (!IS_ERR_OR_NULL(em))
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
|
|
unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
|
|
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
|
btrfs_swap_deactivate(file);
|
|
|
|
|
|
btrfs: fix race between swap file activation and snapshot creation
When creating a snapshot we check if the current number of swap files, in
the root, is non-zero, and if it is, we error out and warn that we can not
create the snapshot because there are active swap files.
However this is racy because when a task started activation of a swap
file, another task might have already started snapshot creation and might
have seen the counter for the number of swap files as zero. This means
that after the swap file is activated we may end up with a snapshot of the
same root successfully created, and therefore when the first write to the
swap file happens it has to fall back into COW mode, which should never
happen for active swap files.
Basically what can happen is:
1) Task A starts snapshot creation and enters ioctl.c:create_snapshot().
There it sees that root->nr_swapfiles has a value of 0 so it continues;
2) Task B enters btrfs_swap_activate(). It is not aware that another task
started snapshot creation but it did not finish yet. It increments
root->nr_swapfiles from 0 to 1;
3) Task B checks that the file meets all requirements to be an active
swap file - it has NOCOW set, there are no snapshots for the inode's
root at the moment, no file holes, no reflinked extents, etc;
4) Task B returns success and now the file is an active swap file;
5) Task A commits the transaction to create the snapshot and finishes.
The swap file's extents are now shared between the original root and
the snapshot;
6) A write into an extent of the swap file is attempted - there is a
snapshot of the file's root, so we fall back to COW mode and therefore
the physical location of the extent changes on disk.
So fix this by taking the snapshot lock during swap file activation before
locking the extent range, as that is the order in which we lock these
during buffered writes.
Fixes: ed46ff3d42378 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-02-05 12:55:38 +00:00
|
|
|
btrfs_drew_write_unlock(&root->snapshot_lock);
|
|
|
|
|
|
2020-08-25 10:02:32 -05:00
|
|
|
btrfs_exclop_finish(fs_info);
|
2016-11-03 10:28:14 -07:00
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
if (device)
|
|
|
|
|
sis->bdev = device->bdev;
|
|
|
|
|
*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
|
|
|
|
|
sis->max = bsi.nr_pages;
|
|
|
|
|
sis->pages = bsi.nr_pages - 1;
|
|
|
|
|
sis->highest_bit = bsi.nr_pages - 1;
|
|
|
|
|
return bsi.nr_extents;
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
/*
 * Stub used in the #else branch of the preprocessor conditional that guards
 * the real swap file implementation. It intentionally does nothing; @file is
 * not examined.
 */
static void btrfs_swap_deactivate(struct file *file)
{
}
|
|
|
|
|
|
|
|
|
|
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
|
|
|
|
sector_t *span)
|
|
|
|
|
{
|
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that matches neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and lock the whole range (0 to (u64)-1) while it
computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:34 +00:00
|
|
|
/*
|
|
|
|
|
* Update the number of bytes used in the VFS' inode. When we replace extents in
|
|
|
|
|
* a range (clone, dedupe, fallocate's zero range), we must update the number of
|
|
|
|
|
* bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
|
|
|
|
|
* always get a correct value.
|
|
|
|
|
*/
|
|
|
|
|
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
|
|
|
|
|
const u64 add_bytes,
|
|
|
|
|
const u64 del_bytes)
|
|
|
|
|
{
|
|
|
|
|
if (add_bytes == del_bytes)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
spin_lock(&inode->lock);
|
|
|
|
|
if (del_bytes > 0)
|
|
|
|
|
inode_sub_bytes(&inode->vfs_inode, del_bytes);
|
|
|
|
|
if (add_bytes > 0)
|
|
|
|
|
inode_add_bytes(&inode->vfs_inode, add_bytes);
|
|
|
|
|
spin_unlock(&inode->lock);
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-15 15:22:41 +00:00
|
|
|
/**
|
|
|
|
|
* Verify that there are no ordered extents for a given file range.
|
|
|
|
|
*
|
|
|
|
|
* @inode: The target inode.
|
|
|
|
|
* @start: Start offset of the file range, should be sector size aligned.
|
|
|
|
|
* @end: End offset (inclusive) of the file range, its value +1 should be
|
|
|
|
|
* sector size aligned.
|
|
|
|
|
*
|
|
|
|
|
* This should typically be used for cases where we locked an inode's VFS lock in
|
|
|
|
|
* exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
|
|
|
|
|
* we have flushed all delalloc in the range, we have waited for all ordered
|
|
|
|
|
* extents in the range to complete and finally we have locked the file range in
|
|
|
|
|
* the inode's io_tree.
|
|
|
|
|
*/
|
|
|
|
|
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
|
|
|
|
|
{
|
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
|
|
|
|
|
if (ordered) {
|
|
|
|
|
btrfs_err(root->fs_info,
|
|
|
|
|
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
|
|
|
|
|
start, end, btrfs_ino(inode), root->root_key.objectid,
|
|
|
|
|
ordered->file_offset,
|
|
|
|
|
ordered->file_offset + ordered->num_bytes - 1);
|
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ASSERT(ordered == NULL);
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-21 17:01:11 -07:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations = {
|
2008-11-17 20:42:26 -05:00
|
|
|
.getattr = btrfs_getattr,
|
2007-06-12 06:35:45 -04:00
|
|
|
.lookup = btrfs_lookup,
|
|
|
|
|
.create = btrfs_create,
|
|
|
|
|
.unlink = btrfs_unlink,
|
|
|
|
|
.link = btrfs_link,
|
|
|
|
|
.mkdir = btrfs_mkdir,
|
|
|
|
|
.rmdir = btrfs_rmdir,
|
2016-09-27 11:03:58 +02:00
|
|
|
.rename = btrfs_rename2,
|
2007-06-12 06:35:45 -04:00
|
|
|
.symlink = btrfs_symlink,
|
|
|
|
|
.setattr = btrfs_setattr,
|
2007-07-11 10:18:17 -04:00
|
|
|
.mknod = btrfs_mknod,
|
2007-11-16 11:45:54 -05:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-01-14 13:26:08 -05:00
|
|
|
.permission = btrfs_permission,
|
2011-07-23 17:37:31 +02:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 05:16:43 -08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2013-09-16 10:42:03 -07:00
|
|
|
.update_time = btrfs_update_time,
|
2014-04-27 20:40:45 +01:00
|
|
|
.tmpfile = btrfs_tmpfile,
|
2021-04-07 14:36:43 +02:00
|
|
|
.fileattr_get = btrfs_fileattr_get,
|
|
|
|
|
.fileattr_set = btrfs_fileattr_set,
|
2007-06-12 06:35:45 -04:00
|
|
|
};
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2009-10-01 15:43:56 -07:00
|
|
|
static const struct file_operations btrfs_dir_file_operations = {
|
2007-06-12 06:35:45 -04:00
|
|
|
.llseek = generic_file_llseek,
|
|
|
|
|
.read = generic_read_dir,
|
2016-05-20 13:50:33 -07:00
|
|
|
.iterate_shared = btrfs_real_readdir,
|
2017-07-24 15:14:25 -04:00
|
|
|
.open = btrfs_opendir,
|
2007-09-14 10:22:47 -04:00
|
|
|
.unlocked_ioctl = btrfs_ioctl,
|
2007-06-12 06:35:45 -04:00
|
|
|
#ifdef CONFIG_COMPAT
|
2015-10-29 08:22:21 +00:00
|
|
|
.compat_ioctl = btrfs_compat_ioctl,
|
2007-06-12 06:35:45 -04:00
|
|
|
#endif
|
2008-06-10 10:07:39 -04:00
|
|
|
.release = btrfs_release_file,
|
2008-09-05 16:13:11 -04:00
|
|
|
.fsync = btrfs_sync_file,
|
2007-06-12 06:35:45 -04:00
|
|
|
};
|
|
|
|
|
|
2009-01-21 13:11:13 -05:00
|
|
|
/*
|
|
|
|
|
* btrfs doesn't support the bmap operation because swapfiles
|
|
|
|
|
* use bmap to make a mapping of extents in the file. They assume
|
|
|
|
|
* these extents won't change over the life of the file and they
|
|
|
|
|
* use the bmap result to do IO directly to the drive.
|
|
|
|
|
*
|
|
|
|
|
* the btrfs bmap call would return logical addresses that aren't
|
|
|
|
|
* suitable for IO and they also will change frequently as COW
|
|
|
|
|
* operations happen. So, swapfile + btrfs == corruption.
|
|
|
|
|
*
|
|
|
|
|
* For now we're avoiding this by dropping bmap.
|
|
|
|
|
*/
|
2009-09-21 17:01:10 -07:00
|
|
|
static const struct address_space_operations btrfs_aops = {
|
2022-04-29 11:12:16 -04:00
|
|
|
.read_folio = btrfs_read_folio,
|
2007-11-01 19:45:34 -04:00
|
|
|
.writepages = btrfs_writepages,
|
2020-06-01 21:47:05 -07:00
|
|
|
.readahead = btrfs_readahead,
|
2020-08-17 11:18:21 -05:00
|
|
|
.direct_IO = noop_direct_IO,
|
2022-02-09 20:21:39 +00:00
|
|
|
.invalidate_folio = btrfs_invalidate_folio,
|
2022-04-30 23:15:16 -04:00
|
|
|
.release_folio = btrfs_release_folio,
|
2022-06-06 10:47:21 -04:00
|
|
|
.migrate_folio = btrfs_migrate_folio,
|
2022-02-09 20:22:03 +00:00
|
|
|
.dirty_folio = filemap_dirty_folio,
|
2009-09-16 11:50:18 +02:00
|
|
|
.error_remove_page = generic_error_remove_page,
|
2016-11-03 10:28:14 -07:00
|
|
|
.swap_activate = btrfs_swap_activate,
|
|
|
|
|
.swap_deactivate = btrfs_swap_deactivate,
|
2007-06-12 06:35:45 -04:00
|
|
|
};
|
|
|
|
|
|
2009-09-21 17:01:11 -07:00
|
|
|
static const struct inode_operations btrfs_file_inode_operations = {
|
2007-06-12 06:35:45 -04:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
|
.setattr = btrfs_setattr,
|
2007-11-16 11:45:54 -05:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-01-14 13:26:08 -05:00
|
|
|
.permission = btrfs_permission,
|
2009-01-21 14:39:14 -05:00
|
|
|
.fiemap = btrfs_fiemap,
|
2011-07-23 17:37:31 +02:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 05:16:43 -08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2012-03-26 09:46:47 -04:00
|
|
|
.update_time = btrfs_update_time,
|
2021-04-07 14:36:43 +02:00
|
|
|
.fileattr_get = btrfs_fileattr_get,
|
|
|
|
|
.fileattr_set = btrfs_fileattr_set,
|
2007-06-12 06:35:45 -04:00
|
|
|
};
|
2009-09-21 17:01:11 -07:00
|
|
|
static const struct inode_operations btrfs_special_inode_operations = {
|
2007-07-11 10:18:17 -04:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
|
.setattr = btrfs_setattr,
|
2008-01-14 13:26:08 -05:00
|
|
|
.permission = btrfs_permission,
|
2008-07-24 12:16:36 -04:00
|
|
|
.listxattr = btrfs_listxattr,
|
2011-07-23 17:37:31 +02:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 05:16:43 -08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2012-03-26 09:46:47 -04:00
|
|
|
.update_time = btrfs_update_time,
|
2007-07-11 10:18:17 -04:00
|
|
|
};
|
2009-09-21 17:01:11 -07:00
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations = {
|
2015-11-17 10:20:54 -05:00
|
|
|
.get_link = page_get_link,
|
2010-11-19 02:05:24 +00:00
|
|
|
.getattr = btrfs_getattr,
|
2011-11-30 10:45:38 -05:00
|
|
|
.setattr = btrfs_setattr,
|
2008-01-14 13:26:08 -05:00
|
|
|
.permission = btrfs_permission,
|
2009-02-04 09:29:13 -05:00
|
|
|
.listxattr = btrfs_listxattr,
|
2012-03-26 09:46:47 -04:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 06:35:45 -04:00
|
|
|
};
|
2009-09-21 16:00:26 -04:00
|
|
|
|
2009-10-09 09:54:36 -04:00
|
|
|
const struct dentry_operations btrfs_dentry_operations = {
|
2009-09-21 16:00:26 -04:00
|
|
|
.d_delete = btrfs_dentry_delete,
|
|
|
|
|
};
|