2018-04-03 17:23:33 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2007-06-12 13:07:21 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
*/
|
|
|
|
|
2008-04-25 20:53:30 +00:00
|
|
|
#include <linux/kernel.h>
|
2008-02-20 17:07:25 +00:00
|
|
|
#include <linux/bio.h>
|
2007-06-12 10:35:45 +00:00
|
|
|
#include <linux/buffer_head.h>
|
2008-05-02 18:43:14 +00:00
|
|
|
#include <linux/file.h>
|
2007-06-12 10:35:45 +00:00
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/backing-dev.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/compat.h>
|
2007-11-16 16:45:54 +00:00
|
|
|
#include <linux/xattr.h>
|
2008-07-24 16:16:36 +00:00
|
|
|
#include <linux/posix_acl.h>
|
2008-10-30 18:25:28 +00:00
|
|
|
#include <linux/falloc.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
|
|
|
#include <linux/slab.h>
|
2011-05-06 13:33:15 +00:00
|
|
|
#include <linux/ratelimit.h>
|
2013-01-29 06:04:50 +00:00
|
|
|
#include <linux/btrfs.h>
|
2013-01-29 23:40:14 +00:00
|
|
|
#include <linux/blkdev.h>
|
2013-06-19 14:16:26 +00:00
|
|
|
#include <linux/posix_acl_xattr.h>
|
2015-02-22 16:58:50 +00:00
|
|
|
#include <linux/uio.h>
|
2017-10-19 18:15:57 +00:00
|
|
|
#include <linux/magic.h>
|
2018-01-29 11:41:30 +00:00
|
|
|
#include <linux/iversion.h>
|
2018-04-16 19:10:14 +00:00
|
|
|
#include <asm/unaligned.h>
|
2007-06-12 10:35:45 +00:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
#include "btrfs_inode.h"
|
|
|
|
#include "print-tree.h"
|
2008-07-17 16:53:50 +00:00
|
|
|
#include "ordered-data.h"
|
2008-08-28 10:21:17 +00:00
|
|
|
#include "xattr.h"
|
2008-09-05 20:13:11 +00:00
|
|
|
#include "tree-log.h"
|
2011-07-22 13:41:52 +00:00
|
|
|
#include "volumes.h"
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
#include "compression.h"
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able to get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 14:25:08 +00:00
|
|
|
#include "locking.h"
|
2011-01-28 22:05:48 +00:00
|
|
|
#include "free-space-cache.h"
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
#include "inode-map.h"
|
2013-01-29 03:18:40 +00:00
|
|
|
#include "backref.h"
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
#include "props.h"
|
2014-12-12 08:44:35 +00:00
|
|
|
#include "qgroup.h"
|
2016-07-11 03:05:29 +00:00
|
|
|
#include "dedupe.h"
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
/*
 * Argument bundle pairing a key pointer with a root pointer.
 * NOTE(review): judging by the name this is handed to the iget lookup
 * callbacks; its users are outside this part of the file - confirm.
 */
struct btrfs_iget_args {
	struct btrfs_key *location;	/* key being looked up */
	struct btrfs_root *root;	/* root the key belongs to */
};
|
|
|
|
|
2015-12-08 19:23:20 +00:00
|
|
|
struct btrfs_dio_data {
|
|
|
|
u64 reserve;
|
|
|
|
u64 unsubmitted_oe_range_start;
|
|
|
|
u64 unsubmitted_oe_range_end;
|
2016-12-15 06:36:05 +00:00
|
|
|
int overwrite;
|
2015-12-08 19:23:20 +00:00
|
|
|
};
|
|
|
|
|
2009-09-22 00:01:11 +00:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_special_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_file_inode_operations;
|
2009-09-22 00:01:10 +00:00
|
|
|
static const struct address_space_operations btrfs_aops;
|
|
|
|
static const struct address_space_operations btrfs_symlink_aops;
|
2009-10-01 22:43:56 +00:00
|
|
|
static const struct file_operations btrfs_dir_file_operations;
|
2015-11-19 10:42:28 +00:00
|
|
|
static const struct extent_io_ops btrfs_extent_io_ops;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
/*
 * Slab cache pointers.  Only the inode cache is private to this file; the
 * remaining three have external linkage.
 */
static struct kmem_cache *btrfs_inode_cachep;

struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
#define S_SHIFT 12
|
2015-11-19 10:42:31 +00:00
|
|
|
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
|
2007-06-12 10:35:45 +00:00
|
|
|
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
|
|
|
|
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
|
|
|
|
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
|
|
|
|
[S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
|
|
|
|
[S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
|
|
|
|
[S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
|
|
|
|
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
|
|
|
|
};
|
|
|
|
|
2013-01-12 02:57:22 +00:00
|
|
|
/*
 * Prototypes for static helpers that are referenced before their
 * definitions; being static, they are defined later in this file.
 */
static int btrfs_setsize(struct inode *inode,
			 struct iattr *attr);
static int btrfs_truncate(struct inode *inode,
			  bool skip_writeback);
static int
btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
|
2008-11-07 03:02:51 +00:00
|
|
|
/*
 * Prototype only; the static definition follows later in this file.
 * page_started and nr_written are pointers and can therefore be written
 * through by the callee.
 */
static noinline int cow_file_range(struct inode *inode, struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started,
				   unsigned long *nr_written, int unlock,
				   struct btrfs_dedupe_hash *hash);
|
2017-01-31 15:50:22 +00:00
|
|
|
/*
 * Prototype only; the static definition follows later in this file.
 * Returns a pointer to an extent map.
 */
static struct extent_map *create_io_em(struct inode *inode,
				       u64 start, u64 len, u64 orig_start,
				       u64 block_start, u64 block_len,
				       u64 orig_block_len, u64 ram_bytes,
				       int compress_type, int type);
|
2008-07-24 16:17:14 +00:00
|
|
|
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will hang with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
/*
 * Prototype only; the static definition follows later in this file.  Takes
 * a byte range (offset, bytes) of the inode and an uptodate flag.
 */
static void __endio_write_update_ordered(struct inode *inode, const u64 offset,
					 const u64 bytes, const bool uptodate);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cleanup all submitted ordered extents in specified range to handle errors
|
|
|
|
* from the fill_dellaloc() callback.
|
|
|
|
*
|
|
|
|
* NOTE: caller must ensure that when an error happens, it can not call
|
|
|
|
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
|
|
|
|
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
|
|
|
|
* to be released, which we want to happen only when finishing the ordered
|
|
|
|
* extent (btrfs_finish_ordered_io()). Also note that the caller of the
|
|
|
|
* fill_delalloc() callback already does proper cleanup for the first page of
|
|
|
|
* the range, that is, it invokes the callback writepage_end_io_hook() for the
|
|
|
|
* range of the first page.
|
|
|
|
*/
|
|
|
|
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
|
|
|
|
const u64 offset,
|
|
|
|
const u64 bytes)
|
|
|
|
{
|
2017-09-01 08:58:47 +00:00
|
|
|
unsigned long index = offset >> PAGE_SHIFT;
|
|
|
|
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
while (index <= end_index) {
|
|
|
|
page = find_get_page(inode->i_mapping, index);
|
|
|
|
index++;
|
|
|
|
if (!page)
|
|
|
|
continue;
|
|
|
|
ClearPagePrivate2(page);
|
|
|
|
put_page(page);
|
|
|
|
}
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
|
|
|
|
bytes - PAGE_SIZE, false);
|
|
|
|
}
|
|
|
|
|
2013-04-25 20:41:01 +00:00
|
|
|
/* Prototype only; being static, the definition follows later in this file. */
static int btrfs_dirty_inode(struct inode *inode);
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2015-03-16 21:38:52 +00:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Only built when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is defined.  Points the
 * ops of @inode's io_tree at btrfs_extent_io_ops, which is static to this
 * file.
 */
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
|
|
|
|
|
2009-11-12 09:35:27 +00:00
|
|
|
/*
 * Set up the security related attributes of @inode, which lives in @dir:
 * first btrfs_init_acl(), then btrfs_xattr_security_init().
 *
 * Returns the non-zero result of btrfs_init_acl() if it fails, in which case
 * btrfs_xattr_security_init() is not called; otherwise returns whatever
 * btrfs_xattr_security_init() returns.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int ret = btrfs_init_acl(trans, inode, dir);

	if (ret)
		return ret;

	return btrfs_xattr_security_init(trans, inode, dir, qstr);
}
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
/*
|
|
|
|
* this does all the hard work for inserting an inline extent into
|
|
|
|
* the btree. The caller should have done a btrfs_drop_extents so that
|
|
|
|
* no overlapping inline items exist in the btree
|
|
|
|
*/
|
2014-05-21 20:35:51 +00:00
|
|
|
static int insert_inline_extent(struct btrfs_trans_handle *trans,
|
2014-01-07 11:42:27 +00:00
|
|
|
struct btrfs_path *path, int extent_inserted,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
struct btrfs_root *root, struct inode *inode,
|
|
|
|
u64 start, size_t size, size_t compressed_size,
|
2011-03-28 08:30:38 +00:00
|
|
|
int compress_type,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
struct page **compressed_pages)
|
|
|
|
{
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct page *page = NULL;
|
|
|
|
char *kaddr;
|
|
|
|
unsigned long ptr;
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
int ret;
|
|
|
|
size_t cur_size = size;
|
|
|
|
unsigned long offset;
|
|
|
|
|
2011-03-28 08:30:38 +00:00
|
|
|
if (compressed_size && compressed_pages)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
cur_size = compressed_size;
|
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
inode_add_bytes(inode, size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
if (!extent_inserted) {
|
|
|
|
struct btrfs_key key;
|
|
|
|
size_t datasize;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2017-01-10 18:35:31 +00:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2014-01-07 11:42:27 +00:00
|
|
|
key.offset = start;
|
2014-06-04 16:41:45 +00:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
datasize = btrfs_file_extent_calc_inline_size(cur_size);
|
|
|
|
path->leave_spinning = 1;
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
datasize);
|
2017-06-15 17:09:51 +00:00
|
|
|
if (ret)
|
2014-01-07 11:42:27 +00:00
|
|
|
goto fail;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
|
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
|
|
|
|
2010-12-17 06:21:50 +00:00
|
|
|
if (compress_type != BTRFS_COMPRESS_NONE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
struct page *cpage;
|
|
|
|
int i = 0;
|
2009-01-06 02:25:51 +00:00
|
|
|
while (compressed_size > 0) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
cpage = compressed_pages[i];
|
2008-11-11 14:34:41 +00:00
|
|
|
cur_size = min_t(unsigned long, compressed_size,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
PAGE_SIZE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2011-11-25 15:14:28 +00:00
|
|
|
kaddr = kmap_atomic(cpage);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
write_extent_buffer(leaf, kaddr, ptr, cur_size);
|
2011-11-25 15:14:28 +00:00
|
|
|
kunmap_atomic(kaddr);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
i++;
|
|
|
|
ptr += cur_size;
|
|
|
|
compressed_size -= cur_size;
|
|
|
|
}
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei,
|
2010-12-17 06:21:50 +00:00
|
|
|
compress_type);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
} else {
|
|
|
|
page = find_get_page(inode->i_mapping,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
start >> PAGE_SHIFT);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
2011-11-25 15:14:28 +00:00
|
|
|
kaddr = kmap_atomic(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
offset = start & (PAGE_SIZE - 1);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
write_extent_buffer(leaf, kaddr + offset, ptr, size);
|
2011-11-25 15:14:28 +00:00
|
|
|
kunmap_atomic(kaddr);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(page);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2014-01-07 11:42:27 +00:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2009-11-12 09:34:21 +00:00
|
|
|
/*
|
|
|
|
* we're an inline extent, so nobody can
|
|
|
|
* extend the file past i_size without locking
|
|
|
|
* a page we already have locked.
|
|
|
|
*
|
|
|
|
* We must do any isize and inode updates
|
|
|
|
* before we unlock the pages. Otherwise we
|
|
|
|
* could end up racing with unlink.
|
|
|
|
*/
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
BTRFS_I(inode)->disk_i_size = inode->i_size;
|
2012-03-12 15:03:00 +00:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2009-11-12 09:34:21 +00:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
fail:
|
2017-06-15 17:09:51 +00:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* conditionally insert an inline extent into the file. This
|
|
|
|
* does the checks required to make sure the data is small enough
|
|
|
|
* to fit as an inline extent.
|
|
|
|
*/
|
2018-03-02 07:43:15 +00:00
|
|
|
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
|
2013-08-14 18:02:47 +00:00
|
|
|
u64 end, size_t compressed_size,
|
|
|
|
int compress_type,
|
|
|
|
struct page **compressed_pages)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
{
|
2018-03-02 07:43:15 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2013-08-14 18:02:47 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
u64 isize = i_size_read(inode);
|
|
|
|
u64 actual_end = min(end + 1, isize);
|
|
|
|
u64 inline_len = actual_end - start;
|
2016-06-22 22:54:23 +00:00
|
|
|
u64 aligned_end = ALIGN(end, fs_info->sectorsize);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
u64 data_len = inline_len;
|
|
|
|
int ret;
|
2014-01-07 11:42:27 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
int extent_inserted = 0;
|
|
|
|
u32 extent_item_size;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
if (compressed_size)
|
|
|
|
data_len = compressed_size;
|
|
|
|
|
|
|
|
if (start > 0 ||
|
2016-06-22 22:54:23 +00:00
|
|
|
actual_end > fs_info->sectorsize ||
|
|
|
|
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
(!compressed_size &&
|
2016-06-22 22:54:23 +00:00
|
|
|
(actual_end & (fs_info->sectorsize - 1)) == 0) ||
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
end + 1 < isize ||
|
2016-06-22 22:54:23 +00:00
|
|
|
data_len > fs_info->max_inline) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-08-14 18:02:47 +00:00
|
|
|
trans = btrfs_join_transaction(root);
|
2014-01-07 11:42:27 +00:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
btrfs_free_path(path);
|
2013-08-14 18:02:47 +00:00
|
|
|
return PTR_ERR(trans);
|
2014-01-07 11:42:27 +00:00
|
|
|
}
|
2017-10-19 18:15:57 +00:00
|
|
|
trans->block_rsv = &BTRFS_I(inode)->block_rsv;
|
2013-08-14 18:02:47 +00:00
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
if (compressed_size && compressed_pages)
|
|
|
|
extent_item_size = btrfs_file_extent_calc_inline_size(
|
|
|
|
compressed_size);
|
|
|
|
else
|
|
|
|
extent_item_size = btrfs_file_extent_calc_inline_size(
|
|
|
|
inline_len);
|
|
|
|
|
|
|
|
ret = __btrfs_drop_extents(trans, root, inode, path,
|
|
|
|
start, aligned_end, NULL,
|
|
|
|
1, 1, extent_item_size, &extent_inserted);
|
2013-08-14 18:02:47 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-08-14 18:02:47 +00:00
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
if (isize > actual_end)
|
|
|
|
inline_len = min_t(u64, isize, actual_end);
|
2014-01-07 11:42:27 +00:00
|
|
|
ret = insert_inline_extent(trans, path, extent_inserted,
|
|
|
|
root, inode, start,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
inline_len, compressed_size,
|
2011-03-28 08:30:38 +00:00
|
|
|
compress_type, compressed_pages);
|
2012-05-23 20:10:14 +00:00
|
|
|
if (ret && ret != -ENOSPC) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-08-14 18:02:47 +00:00
|
|
|
goto out;
|
2012-05-23 20:10:14 +00:00
|
|
|
} else if (ret == -ENOSPC) {
|
2013-08-14 18:02:47 +00:00
|
|
|
ret = 1;
|
|
|
|
goto out;
|
2012-03-12 15:03:00 +00:00
|
|
|
}
|
2012-05-23 20:10:14 +00:00
|
|
|
|
2013-02-28 18:23:38 +00:00
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
|
2013-08-14 18:02:47 +00:00
|
|
|
out:
|
2015-09-08 09:25:56 +00:00
|
|
|
/*
|
|
|
|
* Don't forget to free the reserved space, as for inlined extent
|
|
|
|
* it won't count as data extent, free them directly here.
|
|
|
|
* And at reserve time, it's always aligned to page size, so
|
|
|
|
* just free one page here.
|
|
|
|
*/
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
|
2014-01-07 11:42:27 +00:00
|
|
|
btrfs_free_path(path);
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2013-08-14 18:02:47 +00:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
struct async_extent {
|
|
|
|
u64 start;
|
|
|
|
u64 ram_size;
|
|
|
|
u64 compressed_size;
|
|
|
|
struct page **pages;
|
|
|
|
unsigned long nr_pages;
|
2010-12-17 06:21:50 +00:00
|
|
|
int compress_type;
|
2008-11-07 03:02:51 +00:00
|
|
|
struct list_head list;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct async_cow {
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct page *locked_page;
|
|
|
|
u64 start;
|
|
|
|
u64 end;
|
2017-10-24 05:18:16 +00:00
|
|
|
unsigned int write_flags;
|
2008-11-07 03:02:51 +00:00
|
|
|
struct list_head extents;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Record one extent on an async_cow work item.
 *
 * Allocates a struct async_extent with GFP_NOFS, copies the arguments into
 * it unchanged and appends it to the tail of @cow->extents.
 *
 * Allocation failure is not reported to the caller: it trips BUG_ON().
 * The function therefore always returns 0.
 */
static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *entry = kmalloc(sizeof(*entry), GFP_NOFS);

	BUG_ON(!entry); /* -ENOMEM */

	/* Range description. */
	entry->start = start;
	entry->ram_size = ram_size;
	entry->compressed_size = compressed_size;
	entry->compress_type = compress_type;

	/* Page array is stored by pointer, not copied. */
	entry->pages = pages;
	entry->nr_pages = nr_pages;

	list_add_tail(&entry->list, &cow->extents);

	return 0;
}
|
|
|
|
|
2017-07-17 13:52:58 +00:00
|
|
|
/*
 * Decide whether the range [start, end] of @inode should go through
 * compression.
 *
 * Returns 1 unconditionally when the FORCE_COMPRESS mount option is set or
 * the inode has defrag_compress set.  Returns 0 when the inode carries
 * BTRFS_INODE_NOCOMPRESS.  Otherwise, if compression is requested by the
 * COMPRESS mount option, the BTRFS_INODE_COMPRESS flag or prop_compress,
 * the answer is whatever btrfs_compress_heuristic() returns for the range;
 * if none of those request compression the result is 0.
 */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_inode *binode = BTRFS_I(inode);

	/* force compress mount option, or compression asked by the defrag ioctl */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS) || binode->defrag_compress)
		return 1;

	/* bad compression ratios */
	if (binode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;

	/* nothing asks for compression on this inode */
	if (!btrfs_test_opt(fs_info, COMPRESS) &&
	    !(binode->flags & BTRFS_INODE_COMPRESS) &&
	    !binode->prop_compress)
		return 0;

	return btrfs_compress_heuristic(inode, start, end);
}
|
|
|
|
|
2017-02-20 11:50:43 +00:00
|
|
|
/*
 * Queue @inode for defrag when a small write lands inside eof.
 *
 * "Small" means @num_bytes is below @small_write.  "Inside eof" means the
 * range does not both start at offset 0 and reach disk_i_size.
 */
static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* Not a small write: nothing to do. */
	if (num_bytes >= small_write)
		return;

	/* The write starts at 0 and reaches disk_i_size: not inside eof. */
	if (start == 0 && end + 1 >= inode->disk_i_size)
		return;

	btrfs_add_inode_defrag(NULL, inode);
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
2008-11-07 03:02:51 +00:00
|
|
|
* we create compressed extents in two phases. The first
|
|
|
|
* phase compresses a range of pages that have already been
|
|
|
|
* locked (both pages and state bits are locked).
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
*
|
2008-11-07 03:02:51 +00:00
|
|
|
* This is done inside an ordered work queue, and the compression
|
|
|
|
* is spread across many cpus. The actual IO submission is step
|
|
|
|
* two, and the ordered work queue takes care of making sure that
|
|
|
|
* happens in the same order things were put onto the queue by
|
|
|
|
* writepages and friends.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
*
|
2008-11-07 03:02:51 +00:00
|
|
|
* If this code finds it can't get good compression, it puts an
|
|
|
|
* entry onto the work queue to write the uncompressed bytes. This
|
|
|
|
* makes sure that both compressed inodes and uncompressed inodes
|
2012-07-25 15:12:06 +00:00
|
|
|
* are written in the same order that the flusher thread sent them
|
|
|
|
* down.
|
2008-09-29 19:18:18 +00:00
|
|
|
*/
|
2014-10-09 20:15:44 +00:00
|
|
|
static noinline void compress_file_range(struct inode *inode,
|
2008-11-07 03:02:51 +00:00
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end,
|
|
|
|
struct async_cow *async_cow,
|
|
|
|
int *num_added)
|
2007-08-27 20:49:44 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
u64 blocksize = fs_info->sectorsize;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
u64 actual_end;
|
2008-12-15 16:44:56 +00:00
|
|
|
u64 isize = i_size_read(inode);
|
2008-07-17 16:53:50 +00:00
|
|
|
int ret = 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
struct page **pages = NULL;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
unsigned long total_compressed = 0;
|
|
|
|
unsigned long total_in = 0;
|
|
|
|
int i;
|
|
|
|
int will_compress;
|
2016-06-22 22:54:23 +00:00
|
|
|
int compress_type = fs_info->compress_type;
|
2013-03-26 17:07:00 +00:00
|
|
|
int redirty = 0;
|
2007-08-27 20:49:44 +00:00
|
|
|
|
2017-02-20 11:50:43 +00:00
|
|
|
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
|
|
|
|
SZ_16K);
|
2011-05-24 19:35:30 +00:00
|
|
|
|
2008-12-15 16:44:56 +00:00
|
|
|
actual_end = min_t(u64, isize, end + 1);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
again:
|
|
|
|
will_compress = 0;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
|
2017-02-14 18:36:54 +00:00
|
|
|
BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
|
|
|
|
nr_pages = min_t(unsigned long, nr_pages,
|
|
|
|
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
|
2007-12-18 01:14:01 +00:00
|
|
|
|
2009-02-04 14:31:06 +00:00
|
|
|
/*
|
|
|
|
* we don't want to send crud past the end of i_size through
|
|
|
|
* compression, that's just a waste of CPU time. So, if the
|
|
|
|
* end of the file is before the start of our current
|
|
|
|
* requested range of bytes, we bail out to the uncompressed
|
|
|
|
* cleanup code that can deal with all of this.
|
|
|
|
*
|
|
|
|
* It isn't really the fastest way to fix things, but this is a
|
|
|
|
* very uncommon corner.
|
|
|
|
*/
|
|
|
|
if (actual_end <= start)
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
total_compressed = actual_end - start;
|
|
|
|
|
2014-10-07 22:44:35 +00:00
|
|
|
/*
|
|
|
|
* skip compression for a small file range(<=blocksize) that
|
2016-05-20 01:18:45 +00:00
|
|
|
* isn't an inline extent, since it doesn't save disk space at all.
|
2014-10-07 22:44:35 +00:00
|
|
|
*/
|
|
|
|
if (total_compressed <= blocksize &&
|
|
|
|
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
2017-02-14 18:36:54 +00:00
|
|
|
total_compressed = min_t(unsigned long, total_compressed,
|
|
|
|
BTRFS_MAX_UNCOMPRESSED);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
total_in = 0;
|
|
|
|
ret = 0;
|
2007-10-15 20:15:53 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
/*
|
|
|
|
* we do compression for mount -o compress and when the
|
|
|
|
* inode has not been flagged as nocompress. This flag can
|
|
|
|
* change at any time if we discover bad compression ratios.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
*/
|
2017-07-17 13:52:58 +00:00
|
|
|
if (inode_need_compress(inode, start, end)) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
WARN_ON(pages);
|
2015-02-20 17:00:26 +00:00
|
|
|
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
|
2011-09-08 02:22:01 +00:00
|
|
|
if (!pages) {
|
|
|
|
/* just bail out to the uncompressed code */
|
|
|
|
goto cont;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2017-07-17 17:41:31 +00:00
|
|
|
if (BTRFS_I(inode)->defrag_compress)
|
|
|
|
compress_type = BTRFS_I(inode)->defrag_compress;
|
|
|
|
else if (BTRFS_I(inode)->prop_compress)
|
2017-07-17 17:17:20 +00:00
|
|
|
compress_type = BTRFS_I(inode)->prop_compress;
|
2010-12-17 06:21:50 +00:00
|
|
|
|
2013-03-26 17:07:00 +00:00
|
|
|
/*
|
|
|
|
* we need to call clear_page_dirty_for_io on each
|
|
|
|
* page in the range. Otherwise applications with the file
|
|
|
|
* mmap'd can wander in and change the page contents while
|
|
|
|
* we are compressing them.
|
|
|
|
*
|
|
|
|
* If the compression fails for any reason, we set the pages
|
|
|
|
* dirty again later on.
|
2017-10-23 22:29:48 +00:00
|
|
|
*
|
|
|
|
* Note that the remaining part is redirtied, the start pointer
|
|
|
|
* has moved, the end is the original one.
|
2013-03-26 17:07:00 +00:00
|
|
|
*/
|
2017-10-23 22:29:48 +00:00
|
|
|
if (!redirty) {
|
|
|
|
extent_range_clear_dirty_for_io(inode, start, end);
|
|
|
|
redirty = 1;
|
|
|
|
}
|
2017-09-15 15:36:57 +00:00
|
|
|
|
|
|
|
/* Compression level is applied here and only here */
|
|
|
|
ret = btrfs_compress_pages(
|
|
|
|
compress_type | (fs_info->compress_level << 4),
|
2010-12-17 06:21:50 +00:00
|
|
|
inode->i_mapping, start,
|
2017-02-14 18:04:07 +00:00
|
|
|
pages,
|
2017-02-14 18:04:07 +00:00
|
|
|
&nr_pages,
|
2010-12-17 06:21:50 +00:00
|
|
|
&total_in,
|
2017-02-14 18:45:05 +00:00
|
|
|
&total_compressed);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
if (!ret) {
|
|
|
|
unsigned long offset = total_compressed &
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
(PAGE_SIZE - 1);
|
2017-02-14 18:04:07 +00:00
|
|
|
struct page *page = pages[nr_pages - 1];
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
char *kaddr;
|
|
|
|
|
|
|
|
/* zero the tail end of the last page, we might be
|
|
|
|
* sending it down to disk
|
|
|
|
*/
|
|
|
|
if (offset) {
|
2011-11-25 15:14:28 +00:00
|
|
|
kaddr = kmap_atomic(page);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
memset(kaddr + offset, 0,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
PAGE_SIZE - offset);
|
2011-11-25 15:14:28 +00:00
|
|
|
kunmap_atomic(kaddr);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
will_compress = 1;
|
|
|
|
}
|
|
|
|
}
|
2011-09-08 02:22:01 +00:00
|
|
|
cont:
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
if (start == 0) {
|
|
|
|
/* lets try to make an inline extent */
|
2017-09-14 22:57:26 +00:00
|
|
|
if (ret || total_in < actual_end) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
/* we didn't compress the entire range, try
|
2008-11-07 03:02:51 +00:00
|
|
|
* to make an uncompressed inline extent.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
*/
|
2018-03-02 07:43:15 +00:00
|
|
|
ret = cow_file_range_inline(inode, start, end, 0,
|
|
|
|
BTRFS_COMPRESS_NONE, NULL);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
} else {
|
2008-11-07 03:02:51 +00:00
|
|
|
/* try making a compressed inline extent */
|
2018-03-02 07:43:15 +00:00
|
|
|
ret = cow_file_range_inline(inode, start, end,
|
2011-03-28 08:30:38 +00:00
|
|
|
total_compressed,
|
|
|
|
compress_type, pages);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret <= 0) {
|
2013-07-29 17:22:24 +00:00
|
|
|
unsigned long clear_flags = EXTENT_DELALLOC |
|
2017-10-19 18:15:55 +00:00
|
|
|
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
|
|
|
|
EXTENT_DO_ACCOUNTING;
|
2014-10-10 09:45:12 +00:00
|
|
|
unsigned long page_error_op;
|
|
|
|
|
|
|
|
page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
|
2013-07-29 17:22:24 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
/*
|
2012-03-12 15:03:00 +00:00
|
|
|
* inline extent creation worked or returned error,
|
|
|
|
* we don't need to create any more async work items.
|
|
|
|
* Unlock and free up our temp pages.
|
2017-10-19 18:15:55 +00:00
|
|
|
*
|
|
|
|
* We use DO_ACCOUNTING here because we need the
|
|
|
|
* delalloc_release_metadata to be done _after_ we drop
|
|
|
|
* our outstanding extent for clearing delalloc for this
|
|
|
|
* range.
|
2008-11-07 03:02:51 +00:00
|
|
|
*/
|
2016-07-19 08:50:36 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, end,
|
|
|
|
NULL, clear_flags,
|
|
|
|
PAGE_UNLOCK |
|
2013-07-29 15:20:47 +00:00
|
|
|
PAGE_CLEAR_DIRTY |
|
|
|
|
PAGE_SET_WRITEBACK |
|
2014-10-10 09:45:12 +00:00
|
|
|
page_error_op |
|
2013-07-29 15:20:47 +00:00
|
|
|
PAGE_END_WRITEBACK);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
goto free_pages_out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (will_compress) {
|
|
|
|
/*
|
|
|
|
* we aren't doing an inline extent, so round the compressed size
|
|
|
|
* up to a block size boundary so the allocator does sane
|
|
|
|
* things
|
|
|
|
*/
|
2013-02-26 08:10:22 +00:00
|
|
|
total_compressed = ALIGN(total_compressed, blocksize);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* one last check to make sure the compression is really a
|
2017-06-06 11:41:15 +00:00
|
|
|
* win, compare the page count read with the blocks on disk,
|
|
|
|
* compression must free at least one sector size
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
*/
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
total_in = ALIGN(total_in, PAGE_SIZE);
|
2017-06-06 11:41:15 +00:00
|
|
|
if (total_compressed + blocksize <= total_in) {
|
2016-03-26 02:01:33 +00:00
|
|
|
*num_added += 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The async work queues will take care of doing actual
|
|
|
|
* allocation on disk for these compressed pages, and
|
|
|
|
* will submit them to the elevator.
|
|
|
|
*/
|
2017-10-03 15:06:01 +00:00
|
|
|
add_async_extent(async_cow, start, total_in,
|
2017-02-14 18:04:07 +00:00
|
|
|
total_compressed, pages, nr_pages,
|
2016-03-26 02:01:33 +00:00
|
|
|
compress_type);
|
|
|
|
|
2017-10-03 15:06:01 +00:00
|
|
|
if (start + total_in < end) {
|
|
|
|
start += total_in;
|
2016-03-26 02:01:33 +00:00
|
|
|
pages = NULL;
|
|
|
|
cond_resched();
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
return;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
}
|
2016-03-26 02:01:33 +00:00
|
|
|
if (pages) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
/*
|
|
|
|
* the compression code ran but failed to make things smaller,
|
|
|
|
* free any pages it allocated and our page pointer array
|
|
|
|
*/
|
2017-02-14 18:04:07 +00:00
|
|
|
for (i = 0; i < nr_pages; i++) {
|
2008-10-31 16:46:39 +00:00
|
|
|
WARN_ON(pages[i]->mapping);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(pages[i]);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
kfree(pages);
|
|
|
|
pages = NULL;
|
|
|
|
total_compressed = 0;
|
2017-02-14 18:04:07 +00:00
|
|
|
nr_pages = 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
/* flag the file so we don't compress in the future */
|
2016-06-22 22:54:23 +00:00
|
|
|
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
|
2017-07-17 17:17:20 +00:00
|
|
|
!(BTRFS_I(inode)->prop_compress)) {
|
2010-01-28 21:18:15 +00:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
|
2010-03-11 14:42:04 +00:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
2009-02-04 14:31:06 +00:00
|
|
|
cleanup_and_bail_uncompressed:
|
2016-03-26 02:01:33 +00:00
|
|
|
/*
|
|
|
|
* No compression, but we still need to write the pages in the file
|
|
|
|
* we've been given so far. redirty the locked page if it corresponds
|
|
|
|
* to our extent and set things up for the async work queue to run
|
|
|
|
* cow_file_range to do the normal delalloc dance.
|
|
|
|
*/
|
|
|
|
if (page_offset(locked_page) >= start &&
|
|
|
|
page_offset(locked_page) <= end)
|
|
|
|
__set_page_dirty_nobuffers(locked_page);
|
|
|
|
/* unlocked later on in the async handlers */
|
|
|
|
|
|
|
|
if (redirty)
|
|
|
|
extent_range_redirty_for_io(inode, start, end);
|
|
|
|
add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
|
|
|
|
BTRFS_COMPRESS_NONE);
|
|
|
|
*num_added += 1;
|
2008-04-17 15:29:12 +00:00
|
|
|
|
2014-10-09 20:15:44 +00:00
|
|
|
return;
|
2008-11-07 03:02:51 +00:00
|
|
|
|
|
|
|
free_pages_out:
|
2017-02-14 18:04:07 +00:00
|
|
|
for (i = 0; i < nr_pages; i++) {
|
2008-11-07 03:02:51 +00:00
|
|
|
WARN_ON(pages[i]->mapping);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(pages[i]);
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
2009-01-06 02:25:51 +00:00
|
|
|
kfree(pages);
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
|
|
|
|
2014-10-06 21:14:24 +00:00
|
|
|
static void free_async_extent_pages(struct async_extent *async_extent)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!async_extent->pages)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (i = 0; i < async_extent->nr_pages; i++) {
|
|
|
|
WARN_ON(async_extent->pages[i]->mapping);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(async_extent->pages[i]);
|
2014-10-06 21:14:24 +00:00
|
|
|
}
|
|
|
|
kfree(async_extent->pages);
|
|
|
|
async_extent->nr_pages = 0;
|
|
|
|
async_extent->pages = NULL;
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* phase two of compressed writeback. This is the ordered portion
|
|
|
|
* of the code, which only gets called in the order the work was
|
|
|
|
* queued. We walk all the async extents created by compress_file_range
|
|
|
|
* and send them down to the disk.
|
|
|
|
*/
|
2014-10-06 21:14:26 +00:00
|
|
|
static noinline void submit_compressed_extents(struct inode *inode,
|
2008-11-07 03:02:51 +00:00
|
|
|
struct async_cow *async_cow)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-11-07 03:02:51 +00:00
|
|
|
struct async_extent *async_extent;
|
|
|
|
u64 alloc_hint = 0;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_io_tree *io_tree;
|
2009-11-11 02:23:48 +00:00
|
|
|
int ret = 0;
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2013-02-06 21:49:15 +00:00
|
|
|
again:
|
2009-01-06 02:25:51 +00:00
|
|
|
while (!list_empty(&async_cow->extents)) {
|
2008-11-07 03:02:51 +00:00
|
|
|
async_extent = list_entry(async_cow->extents.next,
|
|
|
|
struct async_extent, list);
|
|
|
|
list_del(&async_extent->list);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
2009-11-11 02:23:48 +00:00
|
|
|
retry:
|
2008-11-07 03:02:51 +00:00
|
|
|
/* did the compression code fall back to uncompressed IO? */
|
|
|
|
if (!async_extent->pages) {
|
|
|
|
int page_started = 0;
|
|
|
|
unsigned long nr_written = 0;
|
|
|
|
|
|
|
|
lock_extent(io_tree, async_extent->start,
|
2010-02-03 19:33:23 +00:00
|
|
|
async_extent->start +
|
2012-03-01 13:57:19 +00:00
|
|
|
async_extent->ram_size - 1);
|
2008-11-07 03:02:51 +00:00
|
|
|
|
|
|
|
/* allocate blocks */
|
2009-11-11 02:23:48 +00:00
|
|
|
ret = cow_file_range(inode, async_cow->locked_page,
|
|
|
|
async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2016-07-11 03:05:29 +00:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
|
|
|
&page_started, &nr_written, 0,
|
|
|
|
NULL);
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2012-03-12 15:03:00 +00:00
|
|
|
/* JDM XXX */
|
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
/*
|
|
|
|
* if page_started, cow_file_range inserted an
|
|
|
|
* inline extent and took care of all the unlocking
|
|
|
|
* and IO for us. Otherwise, we need to submit
|
|
|
|
* all those pages down to the drive.
|
|
|
|
*/
|
2009-11-11 02:23:48 +00:00
|
|
|
if (!page_started && !ret)
|
2017-12-08 13:55:58 +00:00
|
|
|
extent_write_locked_range(inode,
|
|
|
|
async_extent->start,
|
2009-01-06 02:25:51 +00:00
|
|
|
async_extent->start +
|
2008-11-07 03:02:51 +00:00
|
|
|
async_extent->ram_size - 1,
|
|
|
|
WB_SYNC_ALL);
|
2013-02-06 21:49:15 +00:00
|
|
|
else if (ret)
|
|
|
|
unlock_page(async_cow->locked_page);
|
2008-11-07 03:02:51 +00:00
|
|
|
kfree(async_extent);
|
|
|
|
cond_resched();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
lock_extent(io_tree, async_extent->start,
|
2012-03-01 13:57:19 +00:00
|
|
|
async_extent->start + async_extent->ram_size - 1);
|
2008-11-07 03:02:51 +00:00
|
|
|
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
ret = btrfs_reserve_extent(root, async_extent->ram_size,
|
2008-11-07 03:02:51 +00:00
|
|
|
async_extent->compressed_size,
|
|
|
|
async_extent->compressed_size,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
0, alloc_hint, &ins, 1, 1);
|
2009-11-11 02:23:48 +00:00
|
|
|
if (ret) {
|
2014-10-06 21:14:24 +00:00
|
|
|
free_async_extent_pages(async_extent);
|
2013-02-06 21:49:15 +00:00
|
|
|
|
2013-06-14 20:58:23 +00:00
|
|
|
if (ret == -ENOSPC) {
|
|
|
|
unlock_extent(io_tree, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1);
|
2014-07-24 14:48:05 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we need to redirty the pages if we decide to
|
|
|
|
* fallback to uncompressed IO, otherwise we
|
|
|
|
* will not submit these pages down to lower
|
|
|
|
* layers.
|
|
|
|
*/
|
|
|
|
extent_range_redirty_for_io(inode,
|
|
|
|
async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1);
|
|
|
|
|
2012-03-12 15:03:00 +00:00
|
|
|
goto retry;
|
2013-06-14 20:58:23 +00:00
|
|
|
}
|
2013-02-06 21:49:15 +00:00
|
|
|
goto out_free;
|
2009-11-11 02:23:48 +00:00
|
|
|
}
|
2009-11-12 09:34:21 +00:00
|
|
|
/*
|
|
|
|
* here we're doing allocation and writeback of the
|
|
|
|
* compressed pages
|
|
|
|
*/
|
2017-01-31 15:50:22 +00:00
|
|
|
em = create_io_em(inode, async_extent->start,
|
|
|
|
async_extent->ram_size, /* len */
|
|
|
|
async_extent->start, /* orig_start */
|
|
|
|
ins.objectid, /* block_start */
|
|
|
|
ins.offset, /* block_len */
|
|
|
|
ins.offset, /* orig_block_len */
|
|
|
|
async_extent->ram_size, /* ram_bytes */
|
|
|
|
async_extent->compress_type,
|
|
|
|
BTRFS_ORDERED_COMPRESSED);
|
|
|
|
if (IS_ERR(em))
|
|
|
|
/* ret value is not necessary due to void function */
|
2013-02-06 21:49:15 +00:00
|
|
|
goto out_free_reserve;
|
2017-01-31 15:50:22 +00:00
|
|
|
free_extent_map(em);
|
2013-02-06 21:49:15 +00:00
|
|
|
|
2010-12-17 06:21:50 +00:00
|
|
|
ret = btrfs_add_ordered_extent_compress(inode,
|
|
|
|
async_extent->start,
|
|
|
|
ins.objectid,
|
|
|
|
async_extent->ram_size,
|
|
|
|
ins.offset,
|
|
|
|
BTRFS_ORDERED_COMPRESSED,
|
|
|
|
async_extent->compress_type);
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 09:43:00 +00:00
|
|
|
if (ret) {
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode),
|
|
|
|
async_extent->start,
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 09:43:00 +00:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, 0);
|
2013-02-06 21:49:15 +00:00
|
|
|
goto out_free_reserve;
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 09:43:00 +00:00
|
|
|
}
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
2008-11-07 03:02:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* clear dirty, set writeback and unlock the pages.
|
|
|
|
*/
|
2013-07-29 15:20:47 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, async_extent->start,
|
2016-07-19 08:50:36 +00:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2009-10-08 15:27:10 +00:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2013-07-29 17:22:24 +00:00
|
|
|
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
|
|
|
|
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
|
2013-07-29 15:20:47 +00:00
|
|
|
PAGE_SET_WRITEBACK);
|
2017-06-03 07:38:06 +00:00
|
|
|
if (btrfs_submit_compressed_write(inode,
|
2009-01-06 02:25:51 +00:00
|
|
|
async_extent->start,
|
|
|
|
async_extent->ram_size,
|
|
|
|
ins.objectid,
|
|
|
|
ins.offset, async_extent->pages,
|
2017-10-24 05:18:16 +00:00
|
|
|
async_extent->nr_pages,
|
|
|
|
async_cow->write_flags)) {
|
2014-10-06 21:14:23 +00:00
|
|
|
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct page *p = async_extent->pages[0];
|
|
|
|
const u64 start = async_extent->start;
|
|
|
|
const u64 end = start + async_extent->ram_size - 1;
|
|
|
|
|
|
|
|
p->mapping = inode->i_mapping;
|
|
|
|
tree->ops->writepage_end_io_hook(p, start, end,
|
|
|
|
NULL, 0);
|
|
|
|
p->mapping = NULL;
|
2016-07-19 08:50:36 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, end,
|
|
|
|
NULL, 0,
|
2014-10-06 21:14:23 +00:00
|
|
|
PAGE_END_WRITEBACK |
|
|
|
|
PAGE_SET_ERROR);
|
2014-10-06 21:14:24 +00:00
|
|
|
free_async_extent_pages(async_extent);
|
2014-10-06 21:14:23 +00:00
|
|
|
}
|
2008-11-07 03:02:51 +00:00
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
kfree(async_extent);
|
|
|
|
cond_resched();
|
|
|
|
}
|
2014-10-06 21:14:26 +00:00
|
|
|
return;
|
2013-02-06 21:49:15 +00:00
|
|
|
out_free_reserve:
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
|
2012-03-12 15:03:00 +00:00
|
|
|
out_free:
|
2013-07-29 15:20:47 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, async_extent->start,
|
2016-07-19 08:50:36 +00:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2013-02-06 21:49:15 +00:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2013-07-29 15:20:47 +00:00
|
|
|
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
EXTENT_DELALLOC_NEW |
|
2013-07-29 17:22:24 +00:00
|
|
|
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
|
|
|
|
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
|
2014-10-06 21:14:22 +00:00
|
|
|
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
|
|
|
|
PAGE_SET_ERROR);
|
2014-10-06 21:14:24 +00:00
|
|
|
free_async_extent_pages(async_extent);
|
2012-03-12 15:03:00 +00:00
|
|
|
kfree(async_extent);
|
2013-02-06 21:49:15 +00:00
|
|
|
goto again;
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
|
|
|
|
2010-05-23 15:00:55 +00:00
|
|
|
/*
 * Compute a disk byte offset to pass to the extent allocator as a hint for
 * the file range starting at @start and spanning @num_bytes.
 *
 * The inode's extent map tree is searched, under its read lock, for a mapping
 * that covers the range.  If that mapping's block_start is not an actual
 * block number (it is >= EXTENT_MAP_LAST_BYTE), the first mapping of the
 * inode is looked up instead.  If no usable mapping is found either way, no
 * hint is given.
 *
 * Returns the block_start of the chosen mapping, or 0 when there is no hint.
 */
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *map;
	u64 hint = 0;

	read_lock(&tree->lock);

	map = search_extent_mapping(tree, start, num_bytes);
	if (map && map->block_start >= EXTENT_MAP_LAST_BYTE) {
		/*
		 * The mapping for our range does not point at a real block,
		 * so drop it and try the first block of this inode instead.
		 */
		free_extent_map(map);
		map = search_extent_mapping(tree, 0, 0);
	}

	if (map) {
		/* A fallback mapping may be bogus as well; then keep hint at 0. */
		if (map->block_start < EXTENT_MAP_LAST_BYTE)
			hint = map->block_start;
		free_extent_map(map);
	}

	read_unlock(&tree->lock);

	return hint;
}
|
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
/*
|
|
|
|
* when extent_io.c finds a delayed allocation range in the file,
|
|
|
|
* the call backs end up in this code. The basic idea is to
|
|
|
|
* allocate extents on disk for the range, and create ordered data structs
|
|
|
|
* in ram to track those extents.
|
|
|
|
*
|
|
|
|
* locked_page is the page that writepage had locked already. We use
|
|
|
|
* it to make sure we don't do extra locks or unlocks.
|
|
|
|
*
|
|
|
|
* *page_started is set to one if we unlock locked_page and do everything
|
|
|
|
* required to start IO on it. It may be clean and already done with
|
|
|
|
* IO when we return.
|
|
|
|
*/
|
2013-08-14 18:02:47 +00:00
|
|
|
static noinline int cow_file_range(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
2016-07-11 03:05:29 +00:00
|
|
|
u64 start, u64 end, u64 delalloc_end,
|
|
|
|
int *page_started, unsigned long *nr_written,
|
|
|
|
int unlock, struct btrfs_dedupe_hash *hash)
|
2008-11-07 03:02:51 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2013-08-14 18:02:47 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-11-07 03:02:51 +00:00
|
|
|
u64 alloc_hint = 0;
|
|
|
|
u64 num_bytes;
|
|
|
|
unsigned long ram_size;
|
2017-03-06 23:04:20 +00:00
|
|
|
u64 cur_alloc_size = 0;
|
2016-06-22 22:54:23 +00:00
|
|
|
u64 blocksize = fs_info->sectorsize;
|
2008-11-07 03:02:51 +00:00
|
|
|
struct btrfs_key ins;
|
|
|
|
struct extent_map *em;
|
2017-03-06 23:04:20 +00:00
|
|
|
unsigned clear_bits;
|
|
|
|
unsigned long page_ops;
|
|
|
|
bool extent_reserved = false;
|
2008-11-07 03:02:51 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
2017-02-20 11:50:35 +00:00
|
|
|
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
|
2013-10-25 20:19:08 +00:00
|
|
|
WARN_ON_ONCE(1);
|
2014-02-07 17:21:23 +00:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_unlock;
|
2013-10-25 20:19:08 +00:00
|
|
|
}
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2013-02-26 08:10:22 +00:00
|
|
|
num_bytes = ALIGN(end - start + 1, blocksize);
|
2008-11-07 03:02:51 +00:00
|
|
|
num_bytes = max(blocksize, num_bytes);
|
2018-02-15 10:07:59 +00:00
|
|
|
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2017-02-20 11:50:43 +00:00
|
|
|
inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
|
2011-05-24 19:35:30 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
if (start == 0) {
|
|
|
|
/* lets try to make an inline extent */
|
2018-03-02 07:43:15 +00:00
|
|
|
ret = cow_file_range_inline(inode, start, end, 0,
|
|
|
|
BTRFS_COMPRESS_NONE, NULL);
|
2008-11-07 03:02:51 +00:00
|
|
|
if (ret == 0) {
|
2017-10-19 18:15:55 +00:00
|
|
|
/*
|
|
|
|
* We use DO_ACCOUNTING here because we need the
|
|
|
|
* delalloc_release_metadata to be run _after_ we drop
|
|
|
|
* our outstanding extent for clearing delalloc for this
|
|
|
|
* range.
|
|
|
|
*/
|
2016-07-19 08:50:36 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end,
|
|
|
|
delalloc_end, NULL,
|
2013-07-29 15:20:47 +00:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2017-10-19 18:15:55 +00:00
|
|
|
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
|
|
|
|
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
|
2013-07-29 15:20:47 +00:00
|
|
|
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
2008-11-07 03:02:51 +00:00
|
|
|
*nr_written = *nr_written +
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
(end - start + PAGE_SIZE) / PAGE_SIZE;
|
2008-11-07 03:02:51 +00:00
|
|
|
*page_started = 1;
|
|
|
|
goto out;
|
2012-03-12 15:03:00 +00:00
|
|
|
} else if (ret < 0) {
|
|
|
|
goto out_unlock;
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-23 15:00:55 +00:00
|
|
|
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), start,
|
|
|
|
start + num_bytes - 1, 0);
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2018-02-15 04:29:38 +00:00
|
|
|
while (num_bytes > 0) {
|
|
|
|
cur_alloc_size = num_bytes;
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk, call
btrfs_start_delalloc_roots(), or commit the current transaction in order to
reserve some free space, which is obviously a lot of work. But none of that
is necessary: if bytes_may_use is decreased in a timely manner (here by the
128M already reserved), we still have free space.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For the compress path, the
real extent length may not be equal to the file content length, so introduce
a ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(). This is because bytes_may_use is increased by
the file content length. The compress path can then update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both the successful and
failed paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize, 0, alloc_hint,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
&ins, 1, 1);
|
2013-08-14 18:02:47 +00:00
|
|
|
if (ret < 0)
|
2012-03-12 15:03:00 +00:00
|
|
|
goto out_unlock;
|
2017-03-06 23:04:20 +00:00
|
|
|
cur_alloc_size = ins.offset;
|
|
|
|
extent_reserved = true;
|
2009-01-06 02:25:51 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
ram_size = ins.offset;
|
2017-01-31 15:50:22 +00:00
|
|
|
em = create_io_em(inode, start, ins.offset, /* len */
|
|
|
|
start, /* orig_start */
|
|
|
|
ins.objectid, /* block_start */
|
|
|
|
ins.offset, /* block_len */
|
|
|
|
ins.offset, /* orig_block_len */
|
|
|
|
ram_size, /* ram_bytes */
|
|
|
|
BTRFS_COMPRESS_NONE, /* compress_type */
|
2017-02-13 23:35:09 +00:00
|
|
|
BTRFS_ORDERED_REGULAR /* type */);
|
2018-05-30 08:48:56 +00:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
ret = PTR_ERR(em);
|
2013-04-22 10:53:47 +00:00
|
|
|
goto out_reserve;
|
2018-05-30 08:48:56 +00:00
|
|
|
}
|
2017-01-31 15:50:22 +00:00
|
|
|
free_extent_map(em);
|
2008-07-17 16:53:50 +00:00
|
|
|
|
|
|
|
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
|
2008-11-07 03:02:51 +00:00
|
|
|
ram_size, cur_alloc_size, 0);
|
2013-04-22 10:53:47 +00:00
|
|
|
if (ret)
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 09:43:00 +00:00
|
|
|
goto out_drop_extent_cache;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2008-12-12 15:03:38 +00:00
|
|
|
if (root->root_key.objectid ==
|
|
|
|
BTRFS_DATA_RELOC_TREE_OBJECTID) {
|
|
|
|
ret = btrfs_reloc_clone_csums(inode, start,
|
|
|
|
cur_alloc_size);
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handles error from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata under flow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 02:25:51 +00:00
|
|
|
/*
|
|
|
|
* Only drop cache here, and process as normal.
|
|
|
|
*
|
|
|
|
* We must not allow extent_clear_unlock_delalloc()
|
|
|
|
* at out_unlock label to free meta of this ordered
|
|
|
|
* extent, as its meta should be freed by
|
|
|
|
* btrfs_finish_ordered_io().
|
|
|
|
*
|
|
|
|
* So we must continue until @start is increased to
|
|
|
|
* skip current ordered extent.
|
|
|
|
*/
|
2013-08-14 18:02:47 +00:00
|
|
|
if (ret)
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handles error from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata under flow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 02:25:51 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), start,
|
|
|
|
start + ram_size - 1, 0);
|
2008-12-12 15:03:38 +00:00
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no delalloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now an ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 14:39:32 +00:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
/* we're not doing compressed IO, don't unlock the first
|
|
|
|
* page (which the caller expects to stay locked), don't
|
|
|
|
* clear any dirty bits and don't set any writeback bits
|
2009-09-02 20:53:46 +00:00
|
|
|
*
|
|
|
|
* Do set the Private2 bit so we know this page was properly
|
|
|
|
* setup for writepage
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
*/
|
2017-03-06 23:04:20 +00:00
|
|
|
page_ops = unlock ? PAGE_UNLOCK : 0;
|
|
|
|
page_ops |= PAGE_SET_PRIVATE2;
|
2009-10-08 15:27:10 +00:00
|
|
|
|
2013-07-29 15:20:47 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, start,
|
2016-07-19 08:50:36 +00:00
|
|
|
start + ram_size - 1,
|
|
|
|
delalloc_end, locked_page,
|
2013-07-29 15:20:47 +00:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC,
|
2017-03-06 23:04:20 +00:00
|
|
|
page_ops);
|
2018-02-15 04:29:38 +00:00
|
|
|
if (num_bytes < cur_alloc_size)
|
|
|
|
num_bytes = 0;
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 02:25:51 +00:00
|
|
|
else
|
2018-02-15 04:29:38 +00:00
|
|
|
num_bytes -= cur_alloc_size;
|
2007-12-18 01:14:04 +00:00
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
start += cur_alloc_size;
|
2017-03-06 23:04:20 +00:00
|
|
|
extent_reserved = false;
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n, and free its metadata, causing metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 02:25:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* btrfs_reloc_clone_csums() error, since start is increased
|
|
|
|
* extent_clear_unlock_delalloc() at out_unlock label won't
|
|
|
|
* free metadata of current ordered extent, we're OK to exit.
|
|
|
|
*/
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
2012-03-12 15:03:00 +00:00
|
|
|
out:
|
2007-12-18 01:14:01 +00:00
|
|
|
return ret;
|
2012-11-01 07:32:18 +00:00
|
|
|
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 09:43:00 +00:00
|
|
|
out_drop_extent_cache:
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
|
2013-04-22 10:53:47 +00:00
|
|
|
out_reserve:
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
|
2012-03-12 15:03:00 +00:00
|
|
|
out_unlock:
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
|
|
|
|
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
|
2017-03-06 23:04:20 +00:00
|
|
|
page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK;
|
|
|
|
/*
|
|
|
|
* If we reserved an extent for our delalloc range (or a subrange) and
|
|
|
|
* failed to create the respective ordered extent, then it means that
|
|
|
|
* when we reserved the extent we decremented the extent's size from
|
|
|
|
* the data space_info's bytes_may_use counter and incremented the
|
|
|
|
* space_info's bytes_reserved counter by the same amount. We must make
|
|
|
|
* sure extent_clear_unlock_delalloc() does not try to decrement again
|
|
|
|
* the data space_info's bytes_may_use counter, therefore we do not pass
|
|
|
|
* it the flag EXTENT_CLEAR_DATA_RESV.
|
|
|
|
*/
|
|
|
|
if (extent_reserved) {
|
|
|
|
extent_clear_unlock_delalloc(inode, start,
|
|
|
|
start + cur_alloc_size,
|
|
|
|
start + cur_alloc_size,
|
|
|
|
locked_page,
|
|
|
|
clear_bits,
|
|
|
|
page_ops);
|
|
|
|
start += cur_alloc_size;
|
|
|
|
if (start >= end)
|
|
|
|
goto out;
|
|
|
|
}
|
2016-07-19 08:50:36 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
|
|
|
|
locked_page,
|
2017-03-06 23:04:20 +00:00
|
|
|
clear_bits | EXTENT_CLEAR_DATA_RESV,
|
|
|
|
page_ops);
|
2012-03-12 15:03:00 +00:00
|
|
|
goto out;
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
/*
|
|
|
|
* work queue callback to start compression on a file and pages
|
|
|
|
*/
|
|
|
|
static noinline void async_cow_start(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
int num_added = 0;
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
|
|
|
|
compress_file_range(async_cow->inode, async_cow->locked_page,
|
|
|
|
async_cow->start, async_cow->end, async_cow,
|
|
|
|
&num_added);
|
2012-06-08 19:16:12 +00:00
|
|
|
if (num_added == 0) {
|
2012-06-15 18:19:48 +00:00
|
|
|
btrfs_add_delayed_iput(async_cow->inode);
|
2008-11-07 03:02:51 +00:00
|
|
|
async_cow->inode = NULL;
|
2012-06-08 19:16:12 +00:00
|
|
|
}
|
2008-11-07 03:02:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* work queue call back to submit previously compressed pages
|
|
|
|
*/
|
|
|
|
static noinline void async_cow_submit(struct btrfs_work *work)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info;
|
2008-11-07 03:02:51 +00:00
|
|
|
struct async_cow *async_cow;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
|
|
|
|
root = async_cow->root;
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info = root->fs_info;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
|
|
|
|
PAGE_SHIFT;
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2018-02-26 15:15:17 +00:00
|
|
|
/* atomic_sub_return implies a barrier */
|
2016-06-22 22:54:23 +00:00
|
|
|
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
|
2018-02-26 15:15:17 +00:00
|
|
|
5 * SZ_1M)
|
|
|
|
cond_wake_up_nomb(&fs_info->async_submit_wait);
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2009-01-06 02:25:51 +00:00
|
|
|
if (async_cow->inode)
|
2008-11-07 03:02:51 +00:00
|
|
|
submit_compressed_extents(async_cow->inode, async_cow);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
static noinline void async_cow_free(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
2012-06-08 19:16:12 +00:00
|
|
|
if (async_cow->inode)
|
2012-06-15 18:19:48 +00:00
|
|
|
btrfs_add_delayed_iput(async_cow->inode);
|
2008-11-07 03:02:51 +00:00
|
|
|
kfree(async_cow);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Split the range [start, end] into chunks and queue one async_cow work
 * item per chunk on fs_info->delalloc_workers.
 *
 * @inode:        inode owning the range
 * @locked_page:  recorded in every queued async_cow
 * @start, @end:  inclusive byte range to process
 * @page_started: set to 1 before returning
 * @nr_written:   incremented by the number of pages queued
 * @write_flags:  recorded in every queued async_cow
 *
 * Always returns 0; an allocation failure triggers BUG_ON().
 */
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written,
				unsigned int write_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct async_cow *ctx;
	unsigned long nr_pages;
	u64 cur_end;

	/* Clear EXTENT_LOCKED on the whole range before queueing any work. */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL);

	for (; start < end; start = cur_end + 1) {
		ctx = kmalloc(sizeof(*ctx), GFP_NOFS);
		BUG_ON(!ctx); /* -ENOMEM */

		/* igrab() may return NULL; the work callbacks check for it. */
		ctx->inode = igrab(inode);
		ctx->root = root;
		ctx->locked_page = locked_page;
		ctx->start = start;
		ctx->write_flags = write_flags;

		/*
		 * Chunks are capped at 512K unless the inode is flagged
		 * NOCOMPRESS and FORCE_COMPRESS is not set, in which case the
		 * rest of the range goes out as a single work item.
		 */
		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) ||
		    btrfs_test_opt(fs_info, FORCE_COMPRESS))
			cur_end = min(end, start + SZ_512K - 1);
		else
			cur_end = end;

		ctx->end = cur_end;
		INIT_LIST_HEAD(&ctx->extents);

		btrfs_init_work(&ctx->work, btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		/* Account the pages of this chunk before the work can run. */
		nr_pages = (cur_end - start + PAGE_SIZE) >> PAGE_SHIFT;
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &ctx->work);

		*nr_written += nr_pages;
	}

	*page_started = 1;
	return 0;
}
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
/*
 * Check whether the csum lookup finds anything for the byte range
 * [bytenr, bytenr + num_bytes - 1].
 *
 * Returns 0 when the lookup succeeds with an empty result, a negative
 * value when the lookup returns a negative error, and 1 otherwise.
 * Any entries collected by the lookup are freed before returning.
 */
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);

	if (ret != 0 || !list_empty(&list)) {
		/* Only the presence of entries matters; free what was collected. */
		while (!list_empty(&list)) {
			sums = list_entry(list.next, struct btrfs_ordered_sum,
					  list);
			list_del(&sums->list);
			kfree(sums);
		}
		return ret < 0 ? ret : 1;
	}

	return 0;
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* nocow writeback callback. This checks for snapshots or COW copies
|
|
|
|
* of the extents that exist in the file, and COWs the file as required.
|
|
|
|
*
|
|
|
|
* If no cow copies or snapshots exist, we write directly to the existing
|
|
|
|
* blocks on disk
|
|
|
|
*/
|
2009-03-13 00:12:45 +00:00
|
|
|
static noinline int run_delalloc_nocow(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
2008-11-07 03:02:51 +00:00
|
|
|
u64 start, u64 end, int *page_started, int force,
|
|
|
|
unsigned long *nr_written)
|
2007-12-18 01:14:01 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-12-18 01:14:01 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_path *path;
|
2008-10-30 18:20:02 +00:00
|
|
|
struct btrfs_file_extent_item *fi;
|
2007-12-18 01:14:01 +00:00
|
|
|
struct btrfs_key found_key;
|
2017-01-31 15:50:22 +00:00
|
|
|
struct extent_map *em;
|
2008-10-30 18:20:02 +00:00
|
|
|
u64 cow_start;
|
|
|
|
u64 cur_offset;
|
|
|
|
u64 extent_end;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
u64 extent_offset;
|
2008-10-30 18:20:02 +00:00
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 num_bytes;
|
2012-12-03 15:31:19 +00:00
|
|
|
u64 disk_num_bytes;
|
2013-04-04 18:31:27 +00:00
|
|
|
u64 ram_bytes;
|
2008-10-30 18:20:02 +00:00
|
|
|
int extent_type;
|
2012-03-12 15:03:00 +00:00
|
|
|
int ret, err;
|
2008-10-30 18:25:28 +00:00
|
|
|
int type;
|
2008-10-30 18:20:02 +00:00
|
|
|
int nocow;
|
|
|
|
int check_prev = 1;
|
2011-04-20 02:33:24 +00:00
|
|
|
bool nolock;
|
2017-01-10 18:35:31 +00:00
|
|
|
u64 ino = btrfs_ino(BTRFS_I(inode));
|
2007-12-18 01:14:01 +00:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2012-05-31 19:58:55 +00:00
|
|
|
if (!path) {
|
2016-07-19 08:50:36 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, end,
|
|
|
|
locked_page,
|
2013-07-29 15:20:47 +00:00
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2013-07-29 17:22:24 +00:00
|
|
|
EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DEFRAG, PAGE_UNLOCK |
|
2013-07-29 15:20:47 +00:00
|
|
|
PAGE_CLEAR_DIRTY |
|
|
|
|
PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 17:38:47 +00:00
|
|
|
return -ENOMEM;
|
2012-05-31 19:58:55 +00:00
|
|
|
}
|
2011-04-20 02:33:24 +00:00
|
|
|
|
2017-02-20 11:50:35 +00:00
|
|
|
nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
|
2011-04-20 02:33:24 +00:00
|
|
|
|
2008-10-30 18:20:02 +00:00
|
|
|
cow_start = (u64)-1;
|
|
|
|
cur_offset = start;
|
|
|
|
while (1) {
|
2017-01-30 20:25:28 +00:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
|
2008-10-30 18:20:02 +00:00
|
|
|
cur_offset, 0);
|
2013-10-25 20:55:08 +00:00
|
|
|
if (ret < 0)
|
2012-03-12 15:03:00 +00:00
|
|
|
goto error;
|
2008-10-30 18:20:02 +00:00
|
|
|
if (ret > 0 && path->slots[0] > 0 && check_prev) {
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key,
|
|
|
|
path->slots[0] - 1);
|
2011-04-20 02:31:50 +00:00
|
|
|
if (found_key.objectid == ino &&
|
2008-10-30 18:20:02 +00:00
|
|
|
found_key.type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
check_prev = 0;
|
|
|
|
next_slot:
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
2018-01-25 18:02:50 +00:00
|
|
|
if (ret < 0) {
|
|
|
|
if (cow_start != (u64)-1)
|
|
|
|
cur_offset = cow_start;
|
2012-03-12 15:03:00 +00:00
|
|
|
goto error;
|
2018-01-25 18:02:50 +00:00
|
|
|
}
|
2008-10-30 18:20:02 +00:00
|
|
|
if (ret > 0)
|
|
|
|
break;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
}
|
2007-12-18 01:14:01 +00:00
|
|
|
|
2008-10-30 18:20:02 +00:00
|
|
|
nocow = 0;
|
|
|
|
disk_bytenr = 0;
|
2008-12-12 15:03:38 +00:00
|
|
|
num_bytes = 0;
|
2008-10-30 18:20:02 +00:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
If we are using the NO_HOLES feature, we have a tiny time window when
running delalloc for a nodatacow inode where we can race with a concurrent
link or xattr add operation leading to a BUG_ON.
This happens because at run_delalloc_nocow() we end up casting a leaf item
of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
file extent item (struct btrfs_file_extent_item) and then analyse its
extent type field, which won't match any of the expected extent types
(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
explicit BUG_ON(1).
The following sequence diagram shows how the race happens when running a
no-cow dellaloc range [4K, 8K[ for inode 257 and we have the following
neighbour leafs:
Leaf X (has N items) Leaf Y
[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
slot N - 2 slot N - 1 slot 0
(Note the implicit hole for inode 257 regarding the [0, 8K[ range)
CPU 1 CPU 2
run_dealloc_nocow()
btrfs_lookup_file_extent()
--> searches for a key with value
(257 EXTENT_DATA 4096) in the
fs/subvol tree
--> returns us a path with
path->nodes[0] == leaf X and
path->slots[0] == N
because path->slots[0] is >=
btrfs_header_nritems(leaf X), it
calls btrfs_next_leaf()
btrfs_next_leaf()
--> releases the path
hard link added to our inode,
with key (257 INODE_REF 500)
added to the end of leaf X,
so leaf X now has N + 1 keys
--> searches for the key
(257 INODE_REF 256), because
it was the last key in leaf X
before it released the path,
with path->keep_locks set to 1
--> ends up at leaf X again and
it verifies that the key
(257 INODE_REF 256) is no longer
the last key in the leaf, so it
returns with path->nodes[0] ==
leaf X and path->slots[0] == N,
pointing to the new item with
key (257 INODE_REF 500)
the loop iteration of run_dealloc_nocow()
does not break out the loop and continues
because the key referenced in the path
at path->nodes[0] and path->slots[0] is
for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
and its offset (500) is less than our delalloc
range's end (8192)
the item pointed by the path, an inode reference item,
is (incorrectly) interpreted as a file extent item and
we get an invalid extent type, leading to the BUG_ON(1):
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
(...)
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
(...)
} else {
BUG_ON(1)
}
The same can happen if a xattr is added concurrently and ends up having
a key with an offset smaller then the delalloc's range end.
So fix this by skipping keys with a type smaller than
BTRFS_EXTENT_DATA_KEY.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-09 00:33:58 +00:00
|
|
|
if (found_key.objectid > ino)
|
|
|
|
break;
|
|
|
|
if (WARN_ON_ONCE(found_key.objectid < ino) ||
|
|
|
|
found_key.type < BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
|
|
|
}
|
|
|
|
if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
|
2008-10-30 18:20:02 +00:00
|
|
|
found_key.offset > end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.offset > cur_offset) {
|
|
|
|
extent_end = found_key.offset;
|
2009-10-09 13:57:45 +00:00
|
|
|
extent_type = 0;
|
2008-10-30 18:20:02 +00:00
|
|
|
goto out_check;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
|
2013-04-04 18:31:27 +00:00
|
|
|
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
|
2008-10-30 18:25:28 +00:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2008-10-30 18:20:02 +00:00
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
extent_offset = btrfs_file_extent_offset(leaf, fi);
|
2008-10-30 18:20:02 +00:00
|
|
|
extent_end = found_key.offset +
|
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2012-12-03 15:31:19 +00:00
|
|
|
disk_num_bytes =
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf, fi);
|
2008-10-30 18:20:02 +00:00
|
|
|
if (extent_end <= start) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
|
|
|
}
|
2008-12-12 15:03:38 +00:00
|
|
|
if (disk_bytenr == 0)
|
|
|
|
goto out_check;
|
2008-10-30 18:20:02 +00:00
|
|
|
if (btrfs_file_extent_compression(leaf, fi) ||
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) ||
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
goto out_check;
|
2018-05-17 06:58:29 +00:00
|
|
|
/*
|
|
|
|
* Do the same check as in btrfs_cross_ref_exist but
|
|
|
|
* without the unnecessary search.
|
|
|
|
*/
|
|
|
|
if (btrfs_file_extent_generation(leaf, fi) <=
|
|
|
|
btrfs_root_last_snapshot(&root->root_item))
|
|
|
|
goto out_check;
|
2008-10-30 18:25:28 +00:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
|
|
|
|
goto out_check;
|
2016-06-22 22:54:24 +00:00
|
|
|
if (btrfs_extent_readonly(fs_info, disk_bytenr))
|
2008-10-30 18:20:02 +00:00
|
|
|
goto out_check;
|
2018-02-01 00:09:13 +00:00
|
|
|
ret = btrfs_cross_ref_exist(root, ino,
|
|
|
|
found_key.offset -
|
|
|
|
extent_offset, disk_bytenr);
|
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* ret could be -EIO if the above fails to read
|
|
|
|
* metadata.
|
|
|
|
*/
|
|
|
|
if (ret < 0) {
|
|
|
|
if (cow_start != (u64)-1)
|
|
|
|
cur_offset = cow_start;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON_ONCE(nolock);
|
2008-12-12 15:03:38 +00:00
|
|
|
goto out_check;
|
2018-02-01 00:09:13 +00:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
disk_bytenr += extent_offset;
|
2008-12-12 15:03:38 +00:00
|
|
|
disk_bytenr += cur_offset - found_key.offset;
|
|
|
|
num_bytes = min(end + 1, extent_end) - cur_offset;
|
2014-03-27 03:12:25 +00:00
|
|
|
/*
|
|
|
|
* if there are pending snapshots for this root,
|
|
|
|
* we fall into common COW way.
|
|
|
|
*/
|
|
|
|
if (!nolock) {
|
2017-06-22 00:19:11 +00:00
|
|
|
err = btrfs_start_write_no_snapshotting(root);
|
2014-03-27 03:12:25 +00:00
|
|
|
if (!err)
|
|
|
|
goto out_check;
|
|
|
|
}
|
2008-12-12 15:03:38 +00:00
|
|
|
/*
|
|
|
|
* force cow if csum exists in the range.
|
|
|
|
* this ensure that csum for a given extent are
|
|
|
|
* either valid or do not exist.
|
|
|
|
*/
|
2018-02-01 00:09:13 +00:00
|
|
|
ret = csum_exist_in_range(fs_info, disk_bytenr,
|
|
|
|
num_bytes);
|
|
|
|
if (ret) {
|
2016-10-07 02:01:29 +00:00
|
|
|
if (!nolock)
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2018-02-01 00:09:13 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ret could be -EIO if the above fails to read
|
|
|
|
* metadata.
|
|
|
|
*/
|
|
|
|
if (ret < 0) {
|
|
|
|
if (cow_start != (u64)-1)
|
|
|
|
cur_offset = cow_start;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
WARN_ON_ONCE(nolock);
|
2008-12-12 15:03:38 +00:00
|
|
|
goto out_check;
|
2016-10-07 02:01:29 +00:00
|
|
|
}
|
|
|
|
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
|
|
|
|
if (!nolock)
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2016-05-09 12:15:41 +00:00
|
|
|
goto out_check;
|
2016-10-07 02:01:29 +00:00
|
|
|
}
|
2008-10-30 18:20:02 +00:00
|
|
|
nocow = 1;
|
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
extent_end = found_key.offset +
|
2018-06-06 07:41:49 +00:00
|
|
|
btrfs_file_extent_ram_bytes(leaf, fi);
|
2016-06-15 13:22:56 +00:00
|
|
|
extent_end = ALIGN(extent_end,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize);
|
2008-10-30 18:20:02 +00:00
|
|
|
} else {
|
|
|
|
BUG_ON(1);
|
|
|
|
}
|
|
|
|
out_check:
|
|
|
|
if (extent_end <= start) {
|
|
|
|
path->slots[0]++;
|
2014-03-27 03:12:25 +00:00
|
|
|
if (!nolock && nocow)
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2016-05-09 12:15:41 +00:00
|
|
|
if (nocow)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
|
2008-10-30 18:20:02 +00:00
|
|
|
goto next_slot;
|
|
|
|
}
|
|
|
|
if (!nocow) {
|
|
|
|
if (cow_start == (u64)-1)
|
|
|
|
cow_start = cur_offset;
|
|
|
|
cur_offset = extent_end;
|
|
|
|
if (cur_offset > end)
|
|
|
|
break;
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
2008-08-05 17:05:02 +00:00
|
|
|
}
|
|
|
|
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 18:20:02 +00:00
|
|
|
if (cow_start != (u64)-1) {
|
2013-08-14 18:02:47 +00:00
|
|
|
ret = cow_file_range(inode, locked_page,
|
|
|
|
cow_start, found_key.offset - 1,
|
2016-07-11 03:05:29 +00:00
|
|
|
end, page_started, nr_written, 1,
|
|
|
|
NULL);
|
2014-03-27 03:12:25 +00:00
|
|
|
if (ret) {
|
|
|
|
if (!nolock && nocow)
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2016-05-09 12:15:41 +00:00
|
|
|
if (nocow)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_nocow_writers(fs_info,
|
2016-05-09 12:15:41 +00:00
|
|
|
disk_bytenr);
|
2012-03-12 15:03:00 +00:00
|
|
|
goto error;
|
2014-03-27 03:12:25 +00:00
|
|
|
}
|
2008-10-30 18:20:02 +00:00
|
|
|
cow_start = (u64)-1;
|
2008-08-05 17:05:02 +00:00
|
|
|
}
|
2008-10-30 18:20:02 +00:00
|
|
|
|
2008-10-30 18:25:28 +00:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2017-01-31 15:50:22 +00:00
|
|
|
u64 orig_start = found_key.offset - extent_offset;
|
|
|
|
|
|
|
|
em = create_io_em(inode, cur_offset, num_bytes,
|
|
|
|
orig_start,
|
|
|
|
disk_bytenr, /* block_start */
|
|
|
|
num_bytes, /* block_len */
|
|
|
|
disk_num_bytes, /* orig_block_len */
|
|
|
|
ram_bytes, BTRFS_COMPRESS_NONE,
|
|
|
|
BTRFS_ORDERED_PREALLOC);
|
|
|
|
if (IS_ERR(em)) {
|
|
|
|
if (!nolock && nocow)
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2017-01-31 15:50:22 +00:00
|
|
|
if (nocow)
|
|
|
|
btrfs_dec_nocow_writers(fs_info,
|
|
|
|
disk_bytenr);
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
goto error;
|
2008-10-30 18:25:28 +00:00
|
|
|
}
|
2017-01-31 15:50:22 +00:00
|
|
|
free_extent_map(em);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2008-10-30 18:25:28 +00:00
|
|
|
type = BTRFS_ORDERED_PREALLOC;
|
|
|
|
} else {
|
|
|
|
type = BTRFS_ORDERED_NOCOW;
|
|
|
|
}
|
2008-10-30 18:20:02 +00:00
|
|
|
|
|
|
|
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
|
2008-10-30 18:25:28 +00:00
|
|
|
num_bytes, num_bytes, type);
|
2016-05-09 12:15:41 +00:00
|
|
|
if (nocow)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
|
2012-03-12 15:03:00 +00:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2008-11-07 03:02:51 +00:00
|
|
|
|
2010-05-16 14:49:59 +00:00
|
|
|
if (root->root_key.objectid ==
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handles error from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with the
EXTENT_DELALLOC and EXTENT_DO_ACCOUNTING bits, and btrfs_finish_ordered_io()
will both cover OE n and free its metadata, causing a metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 02:25:51 +00:00
|
|
|
BTRFS_DATA_RELOC_TREE_OBJECTID)
|
|
|
|
/*
|
|
|
|
* Error handled later, as we must prevent
|
|
|
|
* extent_clear_unlock_delalloc() in error handler
|
|
|
|
* from freeing metadata of created ordered extent.
|
|
|
|
*/
|
2010-05-16 14:49:59 +00:00
|
|
|
ret = btrfs_reloc_clone_csums(inode, cur_offset,
|
|
|
|
num_bytes);
|
|
|
|
|
2013-07-29 15:20:47 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, cur_offset,
|
2016-07-19 08:50:36 +00:00
|
|
|
cur_offset + num_bytes - 1, end,
|
2013-07-29 15:20:47 +00:00
|
|
|
locked_page, EXTENT_LOCKED |
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk, call
btrfs_start_delalloc_roots(), or commit the current transaction in order to
reserve some free space, which is obviously a lot of work. But that is not
necessary: if bytes_may_use is decreased in a timely manner (by the 128M
already reserved), we still have free space.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally on both the successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
EXTENT_DELALLOC |
|
|
|
|
EXTENT_CLEAR_DATA_RESV,
|
|
|
|
PAGE_UNLOCK | PAGE_SET_PRIVATE2);
|
|
|
|
|
2014-03-27 03:12:25 +00:00
|
|
|
if (!nolock && nocow)
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2008-10-30 18:20:02 +00:00
|
|
|
cur_offset = extent_end;
|
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error
[BUG]
When btrfs_reloc_clone_csum() reports error, it can underflow metadata
and leads to kernel assertion on outstanding extents in
run_delalloc_nocow() and cow_file_range().
BTRFS info (device vdb5): relocating block group 12582912 flags data
BTRFS info (device vdb5): found 1 extents
assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858
Currently, due to another bug blocking ordered extents, the bug is only
reproducible under certain block group layout and using error injection.
a) Create one data block group with one 4K extent in it.
To avoid the bug that hangs btrfs due to ordered extent which never
finishes
b) Make btrfs_reloc_clone_csum() always fail
c) Relocate that block group
[CAUSE]
run_delalloc_nocow() and cow_file_range() handles error from
btrfs_reloc_clone_csum() wrongly:
(The ascii chart shows a more generic case of this bug other than the
bug mentioned above)
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- cleanup range --------------->|
|<----------- ----------->|
\/
btrfs_finish_ordered_io() range
So the error handler, which calls extent_clear_unlock_delalloc() with the
EXTENT_DELALLOC and EXTENT_DO_ACCOUNTING bits, and btrfs_finish_ordered_io()
will both cover OE n and free its metadata, causing a metadata underflow.
[Fix]
The fix is to ensure after calling btrfs_add_ordered_extent(), we only
call error handler after increasing the iteration offset, so that
cleanup range won't cover any created ordered extent.
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<----------- ----------->|<---------- cleanup range --------->|
\/
btrfs_finish_ordered_io() range
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2017-03-08 02:25:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* btrfs_reloc_clone_csums() error, now we're OK to call error
|
|
|
|
* handler, as metadata for created ordered extent will only
|
|
|
|
* be freed by btrfs_finish_ordered_io().
|
|
|
|
*/
|
|
|
|
if (ret)
|
|
|
|
goto error;
|
2008-10-30 18:20:02 +00:00
|
|
|
if (cur_offset > end)
|
|
|
|
break;
|
2007-12-18 01:14:01 +00:00
|
|
|
}
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 18:20:02 +00:00
|
|
|
|
2012-05-31 19:58:55 +00:00
|
|
|
if (cur_offset <= end && cow_start == (u64)-1) {
|
2008-10-30 18:20:02 +00:00
|
|
|
cow_start = cur_offset;
|
2012-05-31 19:58:55 +00:00
|
|
|
cur_offset = end;
|
|
|
|
}
|
|
|
|
|
2008-10-30 18:20:02 +00:00
|
|
|
if (cow_start != (u64)-1) {
|
2016-07-11 03:05:29 +00:00
|
|
|
ret = cow_file_range(inode, locked_page, cow_start, end, end,
|
|
|
|
page_started, nr_written, 1, NULL);
|
2013-10-25 20:55:08 +00:00
|
|
|
if (ret)
|
2012-03-12 15:03:00 +00:00
|
|
|
goto error;
|
2008-10-30 18:20:02 +00:00
|
|
|
}
|
|
|
|
|
2012-03-12 15:03:00 +00:00
|
|
|
error:
|
2012-05-31 19:58:55 +00:00
|
|
|
if (ret && cur_offset < end)
|
2016-07-19 08:50:36 +00:00
|
|
|
extent_clear_unlock_delalloc(inode, cur_offset, end, end,
|
2013-07-29 15:20:47 +00:00
|
|
|
locked_page, EXTENT_LOCKED |
|
2013-07-29 17:22:24 +00:00
|
|
|
EXTENT_DELALLOC | EXTENT_DEFRAG |
|
|
|
|
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
|
|
|
|
PAGE_CLEAR_DIRTY |
|
2013-07-29 15:20:47 +00:00
|
|
|
PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
2008-08-05 17:05:02 +00:00
|
|
|
btrfs_free_path(path);
|
2012-03-12 15:03:00 +00:00
|
|
|
return ret;
|
2007-12-18 01:14:01 +00:00
|
|
|
}
|
|
|
|
|
2014-07-03 10:22:07 +00:00
|
|
|
/*
 * Decide whether a delalloc range has to go through COW even though the
 * inode is flagged NODATACOW or PREALLOC.
 *
 * @inode: inode owning the range
 * @start: first byte of the range
 * @end:   last byte of the range
 *
 * Returns 1 when the inode has one of those two flags, its defrag_bytes
 * counter is non-zero and test_range_bit() reports EXTENT_DEFRAG for the
 * range in the inode's io_tree.  Returns 0 in every other case.
 */
static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_inode *binode = BTRFS_I(inode);

	/* Neither NODATACOW nor PREALLOC: nothing to force. */
	if (!(binode->flags &
	      (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	/*
	 * defrag_bytes is only a hint and is read without taking the inode
	 * spinlock.  A non-zero value means the file is being defragged, so
	 * only then is it worth checking the range itself.
	 */
	if (!binode->defrag_bytes)
		return 0;

	/* Force COW when the given extent needs to be defragged. */
	if (test_range_bit(&binode->io_tree, start, end,
			   EXTENT_DEFRAG, 0, NULL))
		return 1;

	return 0;
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* extent_io.c call back to do delayed allocation processing
|
|
|
|
*/
|
2017-05-05 15:57:13 +00:00
|
|
|
static int run_delalloc_range(void *private_data, struct page *locked_page,
|
2008-11-07 03:02:51 +00:00
|
|
|
u64 start, u64 end, int *page_started,
|
2017-10-24 05:18:16 +00:00
|
|
|
unsigned long *nr_written,
|
|
|
|
struct writeback_control *wbc)
|
2007-12-18 01:14:01 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct inode *inode = private_data;
|
2007-12-18 01:14:01 +00:00
|
|
|
int ret;
|
2014-07-03 10:22:07 +00:00
|
|
|
int force_cow = need_force_cow(inode, start, end);
|
2017-10-24 05:18:16 +00:00
|
|
|
unsigned int write_flags = wbc_to_write_flags(wbc);
|
2008-06-25 20:01:30 +00:00
|
|
|
|
2014-07-03 10:22:07 +00:00
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2009-01-06 02:25:51 +00:00
|
|
|
page_started, 1, nr_written);
|
2014-07-03 10:22:07 +00:00
|
|
|
} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
|
2008-10-30 18:25:28 +00:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2009-01-06 02:25:51 +00:00
|
|
|
page_started, 0, nr_written);
|
2017-07-17 13:52:58 +00:00
|
|
|
} else if (!inode_need_compress(inode, start, end)) {
|
2016-07-11 03:05:29 +00:00
|
|
|
ret = cow_file_range(inode, locked_page, start, end, end,
|
|
|
|
page_started, nr_written, 1, NULL);
|
2012-06-08 19:26:47 +00:00
|
|
|
} else {
|
|
|
|
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2008-11-07 03:02:51 +00:00
|
|
|
ret = cow_file_range_async(inode, locked_page, start, end,
|
2017-10-24 05:18:16 +00:00
|
|
|
page_started, nr_written,
|
|
|
|
write_flags);
|
2012-06-08 19:26:47 +00:00
|
|
|
}
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
if (ret)
|
|
|
|
btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
|
2007-08-27 20:49:44 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-05-05 15:57:13 +00:00
|
|
|
/*
 * Hook invoked when the extent state @orig is split at offset @split.
 *
 * @private_data: the struct inode the extent state belongs to
 * @orig:         extent state being split
 * @split:        offset at which @orig is split
 *
 * Only delalloc states matter.  Unless the pre-split accounting already
 * covers both resulting pieces, the inode's outstanding extent count is
 * raised by one under the inode spinlock.
 */
static void btrfs_split_extent_hook(void *private_data,
				    struct extent_state *orig, u64 split)
{
	struct inode *inode = private_data;
	struct btrfs_inode *binode = BTRFS_I(inode);
	u64 orig_len;

	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	orig_len = orig->end - orig->start + 1;
	if (orig_len > BTRFS_MAX_EXTENT_SIZE) {
		u64 piece_len;
		u32 pieces;

		/*
		 * See the explanation in btrfs_merge_extent_hook, the same
		 * applies here, just in reverse: add up what the two pieces
		 * need on their own and compare that with what the whole
		 * range was accounted for.
		 */
		piece_len = orig->end - split + 1;
		pieces = count_max_extents(piece_len);
		piece_len = split - orig->start;
		pieces += count_max_extents(piece_len);
		if (count_max_extents(orig_len) >= pieces)
			return;
	}

	spin_lock(&binode->lock);
	btrfs_mod_outstanding_extents(binode, 1);
	spin_unlock(&binode->lock);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* extent_io.c merge_extent_hook, used to track merged delayed allocation
|
|
|
|
* extents so we can keep track of new extents that are just merged onto old
|
|
|
|
* extents, such as when we are doing sequential writes, so we can properly
|
|
|
|
* account for the metadata space we'll need.
|
|
|
|
*/
|
2017-05-05 15:57:13 +00:00
|
|
|
static void btrfs_merge_extent_hook(void *private_data,
|
2011-07-21 16:56:09 +00:00
|
|
|
struct extent_state *new,
|
|
|
|
struct extent_state *other)
|
2009-09-11 20:12:44 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct inode *inode = private_data;
|
2015-02-11 20:08:59 +00:00
|
|
|
u64 new_size, old_size;
|
2017-01-04 10:09:51 +00:00
|
|
|
u32 num_extents;
|
2015-02-11 20:08:59 +00:00
|
|
|
|
2009-09-11 20:12:44 +00:00
|
|
|
/* not delalloc, ignore it */
|
|
|
|
if (!(other->state & EXTENT_DELALLOC))
|
2011-07-21 16:56:09 +00:00
|
|
|
return;
|
2009-09-11 20:12:44 +00:00
|
|
|
|
2015-03-13 19:12:08 +00:00
|
|
|
if (new->start > other->start)
|
|
|
|
new_size = new->end - other->start + 1;
|
|
|
|
else
|
|
|
|
new_size = other->end - new->start + 1;
|
2015-02-11 20:08:59 +00:00
|
|
|
|
|
|
|
/* we're not bigger than the max, unreserve the space and go */
|
|
|
|
if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2017-10-19 18:15:55 +00:00
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
|
2015-02-11 20:08:59 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-03-13 19:01:24 +00:00
|
|
|
* We have to add up either side to figure out how many extents were
|
|
|
|
* accounted for before we merged into one big extent. If the number of
|
|
|
|
* extents we accounted for is <= the amount we need for the new range
|
|
|
|
* then we can return, otherwise drop. Think of it like this
|
|
|
|
*
|
|
|
|
* [ 4k][MAX_SIZE]
|
|
|
|
*
|
|
|
|
* So we've grown the extent by a MAX_SIZE extent, this would mean we
|
|
|
|
* need 2 outstanding extents, on one side we have 1 and the other side
|
|
|
|
* we have 1 so they are == and we can return. But in this case
|
|
|
|
*
|
|
|
|
* [MAX_SIZE+4k][MAX_SIZE+4k]
|
|
|
|
*
|
|
|
|
* Each range on their own accounts for 2 extents, but merged together
|
|
|
|
* they are only 3 extents worth of accounting, so we need to drop in
|
|
|
|
* this case.
|
2015-02-11 20:08:59 +00:00
|
|
|
*/
|
2015-03-13 19:01:24 +00:00
|
|
|
old_size = other->end - other->start + 1;
|
2017-01-04 10:09:51 +00:00
|
|
|
num_extents = count_max_extents(old_size);
|
2015-03-13 19:01:24 +00:00
|
|
|
old_size = new->end - new->start + 1;
|
2017-01-04 10:09:51 +00:00
|
|
|
num_extents += count_max_extents(old_size);
|
|
|
|
if (count_max_extents(new_size) >= num_extents)
|
2015-02-11 20:08:59 +00:00
|
|
|
return;
|
|
|
|
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2017-10-19 18:15:55 +00:00
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2009-09-11 20:12:44 +00:00
|
|
|
}
|
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
|
|
|
|
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
|
|
|
|
&root->delalloc_inodes);
|
|
|
|
set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
root->nr_delalloc_inodes++;
|
|
|
|
if (root->nr_delalloc_inodes == 1) {
|
2016-06-22 22:54:23 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
BUG_ON(!list_empty(&root->delalloc_root));
|
|
|
|
list_add_tail(&root->delalloc_root,
|
2016-06-22 22:54:23 +00:00
|
|
|
&fs_info->delalloc_roots);
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
}
|
|
|
|
|
2018-04-27 09:21:51 +00:00
|
|
|
|
|
|
|
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
|
|
|
|
struct btrfs_inode *inode)
|
2013-05-15 07:48:22 +00:00
|
|
|
{
|
2018-06-29 08:56:42 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2016-06-22 22:54:23 +00:00
|
|
|
|
2017-02-20 11:51:07 +00:00
|
|
|
if (!list_empty(&inode->delalloc_inodes)) {
|
|
|
|
list_del_init(&inode->delalloc_inodes);
|
2013-05-15 07:48:22 +00:00
|
|
|
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2017-02-20 11:51:07 +00:00
|
|
|
&inode->runtime_flags);
|
2013-05-15 07:48:22 +00:00
|
|
|
root->nr_delalloc_inodes--;
|
|
|
|
if (!root->nr_delalloc_inodes) {
|
2018-04-27 09:21:52 +00:00
|
|
|
ASSERT(list_empty(&root->delalloc_inodes));
|
2016-06-22 22:54:23 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
BUG_ON(list_empty(&root->delalloc_root));
|
|
|
|
list_del_init(&root->delalloc_root);
|
2016-06-22 22:54:23 +00:00
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
}
|
2018-04-27 09:21:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_del_delalloc_inode(struct btrfs_root *root,
|
|
|
|
struct btrfs_inode *inode)
|
|
|
|
{
|
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
__btrfs_del_delalloc_inode(root, inode);
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* extent_io.c set_bit_hook, used to track delayed allocation
|
|
|
|
* bytes in this file, and to maintain the list of inodes that
|
|
|
|
* have pending delalloc work to be done.
|
|
|
|
*/
|
2017-05-05 15:57:13 +00:00
|
|
|
static void btrfs_set_bit_hook(void *private_data,
|
2015-01-14 18:52:13 +00:00
|
|
|
struct extent_state *state, unsigned *bits)
|
2008-01-29 20:55:23 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct inode *inode = private_data;
|
2009-09-11 20:12:44 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
|
2014-07-03 10:22:07 +00:00
|
|
|
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
|
|
|
|
WARN_ON(1);
|
2008-12-15 20:54:40 +00:00
|
|
|
/*
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-20 20:20:32 +00:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-15 20:54:40 +00:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
*/
|
2010-05-16 14:48:47 +00:00
|
|
|
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
|
2008-01-29 20:55:23 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 14:48:47 +00:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2017-10-19 18:15:55 +00:00
|
|
|
u32 num_extents = count_max_extents(len);
|
2017-02-20 11:50:35 +00:00
|
|
|
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
|
2009-09-11 20:12:44 +00:00
|
|
|
|
2017-10-19 18:15:55 +00:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2010-03-19 18:07:23 +00:00
|
|
|
|
2015-03-16 21:38:52 +00:00
|
|
|
/* For sanity tests */
|
2016-06-22 22:54:23 +00:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2015-03-16 21:38:52 +00:00
|
|
|
return;
|
|
|
|
|
2017-06-20 18:01:20 +00:00
|
|
|
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
|
|
|
|
fs_info->delalloc_batch);
|
2013-01-29 10:11:59 +00:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2010-05-16 14:48:47 +00:00
|
|
|
BTRFS_I(inode)->delalloc_bytes += len;
|
2014-07-03 10:22:07 +00:00
|
|
|
if (*bits & EXTENT_DEFRAG)
|
|
|
|
BTRFS_I(inode)->defrag_bytes += len;
|
2013-01-29 10:11:59 +00:00
|
|
|
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2013-05-15 07:48:22 +00:00
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
|
|
|
btrfs_add_delalloc_inodes(root, inode);
|
2013-01-29 10:11:59 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2008-01-29 20:55:23 +00:00
|
|
|
}
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
|
|
|
|
if (!(state->state & EXTENT_DELALLOC_NEW) &&
|
|
|
|
(*bits & EXTENT_DELALLOC_NEW)) {
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
|
|
|
|
state->start;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
}
|
2008-01-29 20:55:23 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* extent_io.c clear_bit_hook, see set_bit_hook for why
|
|
|
|
*/
|
2017-05-05 15:57:13 +00:00
|
|
|
static void btrfs_clear_bit_hook(void *private_data,
|
2013-04-29 13:38:46 +00:00
|
|
|
struct extent_state *state,
|
2015-01-14 18:52:13 +00:00
|
|
|
unsigned *bits)
|
2008-01-29 20:55:23 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
|
2017-02-20 11:51:03 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
|
2014-07-03 10:22:07 +00:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2017-01-04 10:09:51 +00:00
|
|
|
u32 num_extents = count_max_extents(len);
|
2014-07-03 10:22:07 +00:00
|
|
|
|
2017-07-27 18:52:55 +00:00
|
|
|
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
|
|
|
|
spin_lock(&inode->lock);
|
2017-02-20 11:51:03 +00:00
|
|
|
inode->defrag_bytes -= len;
|
2017-07-27 18:52:55 +00:00
|
|
|
spin_unlock(&inode->lock);
|
|
|
|
}
|
2014-07-03 10:22:07 +00:00
|
|
|
|
2008-12-15 20:54:40 +00:00
|
|
|
/*
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-20 20:20:32 +00:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-15 20:54:40 +00:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
*/
|
2010-05-16 14:48:47 +00:00
|
|
|
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
|
2017-02-20 11:51:03 +00:00
|
|
|
struct btrfs_root *root = inode->root;
|
2012-07-10 11:28:39 +00:00
|
|
|
bool do_list = !btrfs_is_free_space_inode(inode);
|
2008-04-22 17:26:47 +00:00
|
|
|
|
2017-10-19 18:15:55 +00:00
|
|
|
spin_lock(&inode->lock);
|
|
|
|
btrfs_mod_outstanding_extents(inode, -num_extents);
|
|
|
|
spin_unlock(&inode->lock);
|
2010-05-16 14:48:47 +00:00
|
|
|
|
2013-09-27 18:57:43 +00:00
|
|
|
/*
|
|
|
|
* We don't reserve metadata space for space cache inodes so we
|
|
|
|
* don't need to call dellalloc_release_metadata if there is an
|
|
|
|
* error.
|
|
|
|
*/
|
2017-03-06 23:04:20 +00:00
|
|
|
if (*bits & EXTENT_CLEAR_META_RESV &&
|
2016-06-22 22:54:23 +00:00
|
|
|
root != fs_info->tree_root)
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transcation meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
btrfs_delalloc_release_metadata(inode, len, false);
|
2010-05-16 14:48:47 +00:00
|
|
|
|
2015-03-16 21:38:52 +00:00
|
|
|
/* For sanity tests. */
|
2016-06-22 22:54:23 +00:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2015-03-16 21:38:52 +00:00
|
|
|
return;
|
|
|
|
|
2017-03-06 23:04:20 +00:00
|
|
|
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
|
|
|
|
do_list && !(state->state & EXTENT_NORESERVE) &&
|
|
|
|
(*bits & EXTENT_CLEAR_DATA_RESV))
|
2017-02-20 11:51:03 +00:00
|
|
|
btrfs_free_reserved_data_space_noquota(
|
|
|
|
&inode->vfs_inode,
|
2015-10-08 10:19:37 +00:00
|
|
|
state->start, len);
|
2009-09-11 20:12:44 +00:00
|
|
|
|
2017-06-20 18:01:20 +00:00
|
|
|
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
|
|
|
|
fs_info->delalloc_batch);
|
2017-02-20 11:51:03 +00:00
|
|
|
spin_lock(&inode->lock);
|
|
|
|
inode->delalloc_bytes -= len;
|
|
|
|
if (do_list && inode->delalloc_bytes == 0 &&
|
2013-01-29 10:11:59 +00:00
|
|
|
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2017-02-20 11:51:07 +00:00
|
|
|
&inode->runtime_flags))
|
2013-05-15 07:48:22 +00:00
|
|
|
btrfs_del_delalloc_inode(root, inode);
|
2017-02-20 11:51:03 +00:00
|
|
|
spin_unlock(&inode->lock);
|
2008-01-29 20:55:23 +00:00
|
|
|
}
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
|
|
|
|
if ((state->state & EXTENT_DELALLOC_NEW) &&
|
|
|
|
(*bits & EXTENT_DELALLOC_NEW)) {
|
|
|
|
spin_lock(&inode->lock);
|
|
|
|
ASSERT(inode->new_delalloc_bytes >= len);
|
|
|
|
inode->new_delalloc_bytes -= len;
|
|
|
|
spin_unlock(&inode->lock);
|
|
|
|
}
|
2008-01-29 20:55:23 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
2018-07-18 17:28:09 +00:00
|
|
|
* Merge bio hook, this must check the chunk tree to make sure we don't create
|
|
|
|
* bios that span stripes or chunks
|
2016-06-23 01:31:49 +00:00
|
|
|
*
|
|
|
|
* return 1 if page cannot be merged to bio
|
|
|
|
* return 0 if page can be merged to bio
|
|
|
|
* return error otherwise
|
2008-09-29 19:18:18 +00:00
|
|
|
*/
|
2016-06-05 19:31:54 +00:00
|
|
|
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
size_t size, struct bio *bio,
|
|
|
|
unsigned long bio_flags)
|
2008-03-24 19:02:07 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2013-10-11 22:44:27 +00:00
|
|
|
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
|
2008-03-24 19:02:07 +00:00
|
|
|
u64 length = 0;
|
|
|
|
u64 map_length;
|
|
|
|
int ret;
|
|
|
|
|
2008-11-07 03:02:51 +00:00
|
|
|
if (bio_flags & EXTENT_BIO_COMPRESSED)
|
|
|
|
return 0;
|
|
|
|
|
2013-10-11 22:44:27 +00:00
|
|
|
length = bio->bi_iter.bi_size;
|
2008-03-24 19:02:07 +00:00
|
|
|
map_length = length;
|
2016-06-22 22:54:23 +00:00
|
|
|
ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
|
|
|
|
NULL, 0);
|
2016-06-23 01:31:49 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2009-01-06 02:25:51 +00:00
|
|
|
if (map_length < length + size)
|
2008-03-24 19:02:07 +00:00
|
|
|
return 1;
|
2011-10-04 03:23:13 +00:00
|
|
|
return 0;
|
2008-03-24 19:02:07 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
*
|
|
|
|
* At IO completion time the cums attached on the ordered extent record
|
|
|
|
* are inserted into the btree
|
|
|
|
*/
|
2018-03-08 13:35:48 +00:00
|
|
|
static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
|
2010-05-25 13:48:28 +00:00
|
|
|
u64 bio_offset)
|
2008-02-20 17:07:25 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct inode *inode = private_data;
|
2017-06-03 07:38:06 +00:00
|
|
|
blk_status_t ret = 0;
|
2008-04-16 15:15:20 +00:00
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
|
2012-03-12 15:03:00 +00:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 03:03:00 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2008-04-16 15:15:20 +00:00
|
|
|
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
On a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 03:03:00 +00:00
|
|
|
/*
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
*
|
|
|
|
* At IO completion time the cums attached on the ordered extent record
|
|
|
|
* are inserted into the btree
|
|
|
|
*/
|
2018-07-18 15:36:24 +00:00
|
|
|
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
|
2018-03-08 12:47:33 +00:00
|
|
|
int mirror_num)
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 03:03:00 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct inode *inode = private_data;
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2017-06-03 07:38:06 +00:00
|
|
|
blk_status_t ret;
|
2012-11-05 17:51:52 +00:00
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
|
2015-07-20 13:29:37 +00:00
|
|
|
if (ret) {
|
2017-06-03 07:38:06 +00:00
|
|
|
bio->bi_status = ret;
|
2015-07-20 13:29:37 +00:00
|
|
|
bio_endio(bio);
|
|
|
|
}
|
2012-11-05 17:51:52 +00:00
|
|
|
return ret;
|
2008-04-16 15:14:51 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
2008-12-17 19:51:42 +00:00
|
|
|
* extent_io.c submission hook. This does the right thing for csum calculation
|
2017-11-01 23:19:27 +00:00
|
|
|
* on write, or reading the csums from the tree before a read.
|
|
|
|
*
|
|
|
|
* Rules about async/sync submit,
|
|
|
|
* a) read: sync submit
|
|
|
|
*
|
|
|
|
* b) write without checksum: sync submit
|
|
|
|
*
|
|
|
|
* c) write with checksum:
|
|
|
|
* c-1) if bio is issued by fsync: sync submit
|
|
|
|
* (sync_writers != 0)
|
|
|
|
*
|
|
|
|
* c-2) if root is reloc root: sync submit
|
|
|
|
* (only in case of buffered IO)
|
|
|
|
*
|
|
|
|
* c-3) otherwise: async submit
|
2008-09-29 19:18:18 +00:00
|
|
|
*/
|
2017-07-05 23:41:23 +00:00
|
|
|
static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
|
2017-05-05 15:57:13 +00:00
|
|
|
int mirror_num, unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
2008-04-16 15:14:51 +00:00
|
|
|
{
|
2017-05-05 15:57:13 +00:00
|
|
|
struct inode *inode = private_data;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-04-16 15:14:51 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2015-07-27 09:56:43 +00:00
|
|
|
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
|
2017-06-03 07:38:06 +00:00
|
|
|
blk_status_t ret = 0;
|
2008-10-30 18:23:13 +00:00
|
|
|
int skip_sum;
|
2012-11-16 18:56:32 +00:00
|
|
|
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
|
2008-04-16 15:14:51 +00:00
|
|
|
|
2009-04-17 08:37:41 +00:00
|
|
|
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
|
2008-12-17 19:51:42 +00:00
|
|
|
|
2017-02-20 11:50:35 +00:00
|
|
|
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
|
2015-07-27 09:56:43 +00:00
|
|
|
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
|
2011-10-04 03:23:12 +00:00
|
|
|
|
2016-06-05 19:31:52 +00:00
|
|
|
if (bio_op(bio) != REQ_OP_WRITE) {
|
2016-06-22 22:54:23 +00:00
|
|
|
ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
|
2012-05-02 18:00:54 +00:00
|
|
|
if (ret)
|
2012-11-05 17:51:52 +00:00
|
|
|
goto out;
|
2012-05-02 18:00:54 +00:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 21:58:54 +00:00
|
|
|
if (bio_flags & EXTENT_BIO_COMPRESSED) {
|
2012-11-05 17:51:52 +00:00
|
|
|
ret = btrfs_submit_compressed_read(inode, bio,
|
|
|
|
mirror_num,
|
|
|
|
bio_flags);
|
|
|
|
goto out;
|
2011-03-01 06:48:31 +00:00
|
|
|
} else if (!skip_sum) {
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_lookup_bio_sums(inode, bio, NULL);
|
2011-03-01 06:48:31 +00:00
|
|
|
if (ret)
|
2012-11-05 17:51:52 +00:00
|
|
|
goto out;
|
2011-03-01 06:48:31 +00:00
|
|
|
}
|
2008-08-20 13:44:52 +00:00
|
|
|
goto mapit;
|
2012-11-16 18:56:32 +00:00
|
|
|
} else if (async && !skip_sum) {
|
2008-12-12 15:03:38 +00:00
|
|
|
/* csum items have already been cloned */
|
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
|
|
|
|
goto mapit;
|
2008-10-30 18:23:13 +00:00
|
|
|
/* we're doing a write, do the async checksumming */
|
2017-05-05 15:57:13 +00:00
|
|
|
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
|
|
|
|
bio_offset, inode,
|
2018-07-18 15:36:24 +00:00
|
|
|
btrfs_submit_bio_start);
|
2012-11-05 17:51:52 +00:00
|
|
|
goto out;
|
2012-11-16 18:56:32 +00:00
|
|
|
} else if (!skip_sum) {
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
|
2012-11-16 18:56:32 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-10-30 18:23:13 +00:00
|
|
|
}
|
|
|
|
|
2008-03-24 19:01:56 +00:00
|
|
|
mapit:
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
|
2012-11-05 17:51:52 +00:00
|
|
|
|
|
|
|
out:
|
2017-06-03 07:38:06 +00:00
|
|
|
if (ret) {
|
|
|
|
bio->bi_status = ret;
|
2015-07-20 13:29:37 +00:00
|
|
|
bio_endio(bio);
|
|
|
|
}
|
2012-11-05 17:51:52 +00:00
|
|
|
return ret;
|
2008-02-20 17:07:25 +00:00
|
|
|
}
|
2008-02-20 21:11:05 +00:00
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* given a list of ordered sums record them in the inode. This happens
|
|
|
|
* at IO completion time based on sums calculated at bio submission time.
|
|
|
|
*/
|
2008-07-17 16:54:15 +00:00
|
|
|
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
|
2017-02-10 18:35:37 +00:00
|
|
|
struct inode *inode, struct list_head *list)
|
2008-07-17 16:53:50 +00:00
|
|
|
{
|
|
|
|
struct btrfs_ordered_sum *sum;
|
2018-01-08 08:59:43 +00:00
|
|
|
int ret;
|
2008-07-17 16:53:50 +00:00
|
|
|
|
2009-01-21 15:59:08 +00:00
|
|
|
list_for_each_entry(sum, list, list) {
|
2017-11-08 00:07:43 +00:00
|
|
|
trans->adding_csums = true;
|
2018-01-08 08:59:43 +00:00
|
|
|
ret = btrfs_csum_file_blocks(trans,
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 21:58:54 +00:00
|
|
|
BTRFS_I(inode)->root->fs_info->csum_root, sum);
|
2017-11-08 00:07:43 +00:00
|
|
|
trans->adding_csums = false;
|
2018-01-08 08:59:43 +00:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2008-07-17 16:53:50 +00:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-02-03 19:33:23 +00:00
|
|
|
/*
 * Mark the range [@start, @end] of @inode as delalloc in the inode's
 * io_tree.
 *
 * @extra_bits and @cached_state are forwarded unchanged to
 * set_extent_delalloc(); @dedupe is not used here.  A warning is emitted
 * when @end is page aligned (the fixup worker in this file passes an
 * inclusive end of the form "page start + PAGE_SIZE - 1").
 *
 * Returns the result of set_extent_delalloc().
 */
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      unsigned int extra_bits,
			      struct extent_state **cached_state, int dedupe)
{
	int ret;

	/* low bits all zero means @end sits exactly on a page boundary */
	WARN_ON(!(end & (PAGE_SIZE - 1)));

	ret = set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				  extra_bits, cached_state);
	return ret;
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/* see btrfs_writepage_start_hook for details on why this is required */
|
2008-07-17 16:53:51 +00:00
|
|
|
struct btrfs_writepage_fixup {
|
|
|
|
struct page *page;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
2008-12-02 14:54:17 +00:00
|
|
|
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
|
2008-07-17 16:53:51 +00:00
|
|
|
{
|
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2017-02-27 07:10:38 +00:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2008-07-17 16:53:51 +00:00
|
|
|
struct page *page;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 page_start;
|
|
|
|
u64 page_end;
|
2012-02-15 15:23:57 +00:00
|
|
|
int ret;
|
2008-07-17 16:53:51 +00:00
|
|
|
|
|
|
|
fixup = container_of(work, struct btrfs_writepage_fixup, work);
|
|
|
|
page = fixup->page;
|
2008-07-21 14:29:44 +00:00
|
|
|
again:
|
2008-07-17 16:53:51 +00:00
|
|
|
lock_page(page);
|
|
|
|
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto out_page;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode = page->mapping->host;
|
|
|
|
page_start = page_offset(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
page_end = page_offset(page) + PAGE_SIZE - 1;
|
2008-07-17 16:53:51 +00:00
|
|
|
|
2015-12-03 13:30:40 +00:00
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2012-03-01 13:57:19 +00:00
|
|
|
&cached_state);
|
2008-07-21 14:29:44 +00:00
|
|
|
|
|
|
|
/* already ordered? We're done */
|
2009-09-02 20:53:46 +00:00
|
|
|
if (PagePrivate2(page))
|
2008-07-17 16:53:51 +00:00
|
|
|
goto out;
|
2008-07-21 14:29:44 +00:00
|
|
|
|
2017-02-20 11:50:49 +00:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
PAGE_SIZE);
|
2008-07-21 14:29:44 +00:00
|
|
|
if (ordered) {
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
|
2017-12-12 20:43:52 +00:00
|
|
|
page_end, &cached_state);
|
2008-07-21 14:29:44 +00:00
|
|
|
unlock_page(page);
|
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2012-02-15 15:23:57 +00:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
2008-07-21 14:29:44 +00:00
|
|
|
goto again;
|
|
|
|
}
|
2008-07-17 16:53:51 +00:00
|
|
|
|
2017-02-27 07:10:38 +00:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
PAGE_SIZE);
|
2012-02-15 15:23:57 +00:00
|
|
|
if (ret) {
|
|
|
|
mapping_set_error(page->mapping, ret);
|
|
|
|
end_extent_writepage(page, ret, page_start, page_end);
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-12-05 07:29:19 +00:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
|
|
|
|
&cached_state, 0);
|
|
|
|
if (ret) {
|
|
|
|
mapping_set_error(page->mapping, ret);
|
|
|
|
end_extent_writepage(page, ret, page_start, page_end);
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-07-17 16:53:51 +00:00
|
|
|
ClearPageChecked(page);
|
2012-02-15 15:23:57 +00:00
|
|
|
set_page_dirty(page);
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transcation meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
|
2008-07-17 16:53:51 +00:00
|
|
|
out:
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached_state);
|
2008-07-17 16:53:51 +00:00
|
|
|
out_page:
|
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(page);
|
2011-01-26 08:19:22 +00:00
|
|
|
kfree(fixup);
|
2017-02-27 07:10:38 +00:00
|
|
|
extent_changeset_free(data_reserved);
|
2008-07-17 16:53:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are a few paths in the higher layers of the kernel that directly
|
|
|
|
* set the page dirty bit without asking the filesystem if it is a
|
|
|
|
* good idea. This causes problems because we want to make sure COW
|
|
|
|
* properly happens and the data=ordered rules are followed.
|
|
|
|
*
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
* In our case any range that doesn't have the ORDERED bit set
|
2008-07-17 16:53:51 +00:00
|
|
|
* hasn't been properly setup for IO. We kick off an async process
|
|
|
|
* to fix it up. The async helper will wait for ordered extents, set
|
|
|
|
* the delalloc bit and make it safe to write the page.
|
|
|
|
*/
|
2008-12-02 14:54:17 +00:00
|
|
|
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
|
2008-07-17 16:53:51 +00:00
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-07-17 16:53:51 +00:00
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
|
2009-09-02 20:53:46 +00:00
|
|
|
/* this page is properly in the ordered list */
|
|
|
|
if (TestClearPagePrivate2(page))
|
2008-07-17 16:53:51 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (PageChecked(page))
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
|
|
|
|
if (!fixup)
|
|
|
|
return -EAGAIN;
|
2008-07-22 15:18:09 +00:00
|
|
|
|
2008-07-17 16:53:51 +00:00
|
|
|
SetPageChecked(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
get_page(page);
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
|
|
|
btrfs_init_work(&fixup->work, btrfs_fixup_helper,
|
|
|
|
btrfs_writepage_fixup_worker, NULL, NULL);
|
2008-07-17 16:53:51 +00:00
|
|
|
fixup->page = page;
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
|
2012-02-15 15:23:57 +00:00
|
|
|
return -EBUSY;
|
2008-07-17 16:53:51 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 18:25:28 +00:00
|
|
|
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *inode, u64 file_pos,
|
|
|
|
u64 disk_bytenr, u64 disk_num_bytes,
|
|
|
|
u64 num_bytes, u64 ram_bytes,
|
|
|
|
u8 compression, u8 encryption,
|
|
|
|
u16 other_encoding, int extent_type)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key ins;
|
btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quotas being enabled
[BUG]
Under the following case, we can underflow qgroup reserved space.
Task A | Task B
---------------------------------------------------------------
Quota disabled |
Buffered write |
|- btrfs_check_data_free_space() |
| *NO* qgroup space is reserved |
| since quota is *DISABLED* |
|- All pages are copied to page |
cache |
| Enable quota
| Quota scan finished
|
| Sync_fs
| |- run_delalloc_range
| |- Write pages
| |- btrfs_finish_ordered_io
| |- insert_reserved_file_extent
| |- btrfs_qgroup_release_data()
| Since no qgroup space is
reserved in Task A, we
underflow qgroup reserved
space
This can be detected by fstest btrfs/104.
[CAUSE]
In insert_reserved_file_extent() we tell qgroup to release the @ram_bytes
size of qgroup reserved_space in all cases.
And btrfs_qgroup_release_data() will check if quotas are enabled.
However in the above case, the buffered write happens before quota is
enabled, so we don't have the reserved space for that range.
[FIX]
In insert_reserved_file_extent(), we tell qgroup to release the actual
byte number it released.
In the above case, since we don't have the reserved space, we tell
qgroups to release 0 byte, so the problem can be fixed.
And thanks to the @reserved parameter introduced by the qgroup rework,
and previous patch to return released bytes, the fix can be as small as
10 lines.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
[ changelog updates ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:37 +00:00
|
|
|
u64 qg_released;
|
2014-01-07 11:42:27 +00:00
|
|
|
int extent_inserted = 0;
|
2008-10-30 18:25:28 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 17:38:47 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2008-10-30 18:25:28 +00:00
|
|
|
|
2009-09-11 16:27:37 +00:00
|
|
|
/*
|
|
|
|
* we may be replacing one extent in the tree with another.
|
|
|
|
* The new extent is pinned in the extent map, and we don't want
|
|
|
|
* to drop it from the cache until it is completely in the btree.
|
|
|
|
*
|
|
|
|
* So, tell btrfs_drop_extents to leave this extent in the cache.
|
|
|
|
* the caller is expected to unpin it and allow it to be merged
|
|
|
|
* with the others.
|
|
|
|
*/
|
2014-01-07 11:42:27 +00:00
|
|
|
ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
|
|
|
|
file_pos + num_bytes, NULL, 0,
|
|
|
|
1, sizeof(*fi), &extent_inserted);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-10-30 18:25:28 +00:00
|
|
|
|
2014-01-07 11:42:27 +00:00
|
|
|
if (!extent_inserted) {
|
2017-01-10 18:35:31 +00:00
|
|
|
ins.objectid = btrfs_ino(BTRFS_I(inode));
|
2014-01-07 11:42:27 +00:00
|
|
|
ins.offset = file_pos;
|
|
|
|
ins.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
|
|
|
|
path->leave_spinning = 1;
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &ins,
|
|
|
|
sizeof(*fi));
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
2008-10-30 18:25:28 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, fi, extent_type);
|
|
|
|
btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
|
|
|
|
btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
|
|
|
|
btrfs_set_file_extent_offset(leaf, fi, 0);
|
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
|
|
|
|
btrfs_set_file_extent_compression(leaf, fi, compression);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, fi, encryption);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
|
2009-03-13 15:00:37 +00:00
|
|
|
|
2008-10-30 18:25:28 +00:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2012-09-25 19:26:16 +00:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 18:25:28 +00:00
|
|
|
|
|
|
|
inode_add_bytes(inode, num_bytes);
|
|
|
|
|
|
|
|
ins.objectid = disk_bytenr;
|
|
|
|
ins.offset = disk_num_bytes;
|
|
|
|
ins.type = BTRFS_EXTENT_ITEM_KEY;
|
btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quotas being enabled
[BUG]
Under the following case, we can underflow qgroup reserved space.
Task A | Task B
---------------------------------------------------------------
Quota disabled |
Buffered write |
|- btrfs_check_data_free_space() |
| *NO* qgroup space is reserved |
| since quota is *DISABLED* |
|- All pages are copied to page |
cache |
| Enable quota
| Quota scan finished
|
| Sync_fs
| |- run_delalloc_range
| |- Write pages
| |- btrfs_finish_ordered_io
| |- insert_reserved_file_extent
| |- btrfs_qgroup_release_data()
| Since no qgroup space is
reserved in Task A, we
underflow qgroup reserved
space
This can be detected by fstest btrfs/104.
[CAUSE]
In insert_reserved_file_extent() we tell qgroup to release the @ram_bytes
size of qgroup reserved_space in all cases.
And btrfs_qgroup_release_data() will check if quotas are enabled.
However in the above case, the buffered write happens before quota is
enabled, so we don't have the reserved space for that range.
[FIX]
In insert_reserved_file_extent(), we tell qgroup to release the actual
number of bytes it released.
In the above case, since we don't have the reserved space, we tell
qgroups to release 0 byte, so the problem can be fixed.
And thanks to the @reserved parameter introduced by the qgroup rework,
and previous patch to return released bytes, the fix can be as small as
10 lines.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
[ changelog updates ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:37 +00:00
|
|
|
|
2015-09-08 09:08:37 +00:00
|
|
|
/*
|
2015-10-26 06:11:18 +00:00
|
|
|
* Release the reserved range from inode dirty range map, as it is
|
|
|
|
* already moved into delayed_ref_head
|
2015-09-08 09:08:37 +00:00
|
|
|
*/
|
btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quotas being enabled
[BUG]
Under the following case, we can underflow qgroup reserved space.
Task A | Task B
---------------------------------------------------------------
Quota disabled |
Buffered write |
|- btrfs_check_data_free_space() |
| *NO* qgroup space is reserved |
| since quota is *DISABLED* |
|- All pages are copied to page |
cache |
| Enable quota
| Quota scan finished
|
| Sync_fs
| |- run_delalloc_range
| |- Write pages
| |- btrfs_finish_ordered_io
| |- insert_reserved_file_extent
| |- btrfs_qgroup_release_data()
| Since no qgroup space is
reserved in Task A, we
underflow qgroup reserved
space
This can be detected by fstest btrfs/104.
[CAUSE]
In insert_reserved_file_extent() we tell qgroup to release the @ram_bytes
size of qgroup reserved_space in all cases.
And btrfs_qgroup_release_data() will check if quotas are enabled.
However in the above case, the buffered write happens before quota is
enabled, so we don't have the reserved space for that range.
[FIX]
In insert_reserved_file_extent(), we tell qgroup to release the actual
number of bytes it released.
In the above case, since we don't have the reserved space, we tell
qgroups to release 0 byte, so the problem can be fixed.
And thanks to the @reserved parameter introduced by the qgroup rework,
and previous patch to return released bytes, the fix can be as small as
10 lines.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
[ changelog updates ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:37 +00:00
|
|
|
ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
qg_released = ret;
|
2017-09-29 19:43:49 +00:00
|
|
|
ret = btrfs_alloc_reserved_file_extent(trans, root,
|
|
|
|
btrfs_ino(BTRFS_I(inode)),
|
|
|
|
file_pos, qg_released, &ins);
|
2012-03-12 15:03:00 +00:00
|
|
|
out:
|
2008-10-30 18:25:28 +00:00
|
|
|
btrfs_free_path(path);
|
2009-03-13 15:00:37 +00:00
|
|
|
|
2012-03-12 15:03:00 +00:00
|
|
|
return ret;
|
2008-10-30 18:25:28 +00:00
|
|
|
}
|
|
|
|
|
2013-01-29 03:18:40 +00:00
|
|
|
/* snapshot-aware defrag */
|
|
|
|
struct sa_defrag_extent_backref {
|
|
|
|
struct rb_node node;
|
|
|
|
struct old_sa_defrag_extent *old;
|
|
|
|
u64 root_id;
|
|
|
|
u64 inum;
|
|
|
|
u64 file_pos;
|
|
|
|
u64 extent_offset;
|
|
|
|
u64 num_bytes;
|
|
|
|
u64 generation;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct old_sa_defrag_extent {
|
|
|
|
struct list_head list;
|
|
|
|
struct new_sa_defrag_extent *new;
|
|
|
|
|
|
|
|
u64 extent_offset;
|
|
|
|
u64 bytenr;
|
|
|
|
u64 offset;
|
|
|
|
u64 len;
|
|
|
|
int count;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct new_sa_defrag_extent {
|
|
|
|
struct rb_root root;
|
|
|
|
struct list_head head;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 file_pos;
|
|
|
|
u64 len;
|
|
|
|
u64 bytenr;
|
|
|
|
u64 disk_len;
|
|
|
|
u8 compress_type;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int backref_comp(struct sa_defrag_extent_backref *b1,
|
|
|
|
struct sa_defrag_extent_backref *b2)
|
|
|
|
{
|
|
|
|
if (b1->root_id < b2->root_id)
|
|
|
|
return -1;
|
|
|
|
else if (b1->root_id > b2->root_id)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (b1->inum < b2->inum)
|
|
|
|
return -1;
|
|
|
|
else if (b1->inum > b2->inum)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (b1->file_pos < b2->file_pos)
|
|
|
|
return -1;
|
|
|
|
else if (b1->file_pos > b2->file_pos)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* [------------------------------] ===> (a range of space)
|
|
|
|
* |<--->| |<---->| =============> (fs/file tree A)
|
|
|
|
* |<---------------------------->| ===> (fs/file tree B)
|
|
|
|
*
|
|
|
|
* A range of space can refer to two file extents in one tree while
|
|
|
|
* refer to only one file extent in another tree.
|
|
|
|
*
|
|
|
|
* So we may process a disk offset more than one time(two extents in A)
|
|
|
|
* and locate at the same extent(one extent in B), then insert two same
|
|
|
|
* backrefs(both refer to the extent in B).
|
|
|
|
*/
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void backref_insert(struct rb_root *root,
|
|
|
|
struct sa_defrag_extent_backref *backref)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &root->rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct sa_defrag_extent_backref *entry;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
|
|
|
|
|
|
|
|
ret = backref_comp(backref, entry);
|
|
|
|
if (ret < 0)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&backref->node, parent, p);
|
|
|
|
rb_insert_color(&backref->node, root);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note the backref might has changed, and in this case we just return 0.
|
|
|
|
*/
|
|
|
|
static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
|
|
|
|
void *ctx)
|
|
|
|
{
|
|
|
|
struct btrfs_file_extent_item *extent;
|
|
|
|
struct old_sa_defrag_extent *old = ctx;
|
|
|
|
struct new_sa_defrag_extent *new = old->new;
|
|
|
|
struct btrfs_path *path = new->path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct sa_defrag_extent_backref *backref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct inode *inode = new->inode;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2013-01-29 03:18:40 +00:00
|
|
|
int slot;
|
|
|
|
int ret;
|
|
|
|
u64 extent_offset;
|
|
|
|
u64 num_bytes;
|
|
|
|
|
|
|
|
if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
|
2017-01-10 18:35:31 +00:00
|
|
|
inum == btrfs_ino(BTRFS_I(inode)))
|
2013-01-29 03:18:40 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
key.objectid = root_id;
|
|
|
|
key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
root = btrfs_read_fs_root_no_name(fs_info, &key);
|
|
|
|
if (IS_ERR(root)) {
|
|
|
|
if (PTR_ERR(root) == -ENOENT)
|
|
|
|
return 0;
|
|
|
|
WARN_ON(1);
|
2016-09-20 14:05:02 +00:00
|
|
|
btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
|
2013-01-29 03:18:40 +00:00
|
|
|
inum, offset, root_id);
|
|
|
|
return PTR_ERR(root);
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = inum;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
if (offset > (u64)-1 << 32)
|
|
|
|
key.offset = 0;
|
|
|
|
else
|
|
|
|
key.offset = offset;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
2013-10-31 05:00:08 +00:00
|
|
|
if (WARN_ON(ret < 0))
|
2013-01-29 03:18:40 +00:00
|
|
|
return ret;
|
2013-07-22 16:50:37 +00:00
|
|
|
ret = 0;
|
2013-01-29 03:18:40 +00:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
|
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->slots[0]++;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
|
|
|
|
if (key.objectid > inum)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(leaf, slot,
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
|
|
|
|
continue;
|
|
|
|
|
2013-07-01 14:13:26 +00:00
|
|
|
/*
|
|
|
|
* 'offset' refers to the exact key.offset,
|
|
|
|
* NOT the 'offset' field in btrfs_extent_data_ref, ie.
|
|
|
|
* (key.offset - extent_offset).
|
|
|
|
*/
|
|
|
|
if (key.offset != offset)
|
2013-01-29 03:18:40 +00:00
|
|
|
continue;
|
|
|
|
|
2013-07-01 14:13:26 +00:00
|
|
|
extent_offset = btrfs_file_extent_offset(leaf, extent);
|
2013-01-29 03:18:40 +00:00
|
|
|
num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
|
2013-07-01 14:13:26 +00:00
|
|
|
|
2013-01-29 03:18:40 +00:00
|
|
|
if (extent_offset >= old->extent_offset + old->offset +
|
|
|
|
old->len || extent_offset + num_bytes <=
|
|
|
|
old->extent_offset + old->offset)
|
|
|
|
continue;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
backref = kmalloc(sizeof(*backref), GFP_NOFS);
|
|
|
|
if (!backref) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
backref->root_id = root_id;
|
|
|
|
backref->inum = inum;
|
2013-07-01 14:13:26 +00:00
|
|
|
backref->file_pos = offset;
|
2013-01-29 03:18:40 +00:00
|
|
|
backref->num_bytes = num_bytes;
|
|
|
|
backref->extent_offset = extent_offset;
|
|
|
|
backref->generation = btrfs_file_extent_generation(leaf, extent);
|
|
|
|
backref->old = old;
|
|
|
|
backref_insert(&new->root, backref);
|
|
|
|
old->count++;
|
|
|
|
out:
|
|
|
|
btrfs_release_path(path);
|
|
|
|
WARN_ON(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Collect, for every old extent queued on new->head, the file extent items
 * that reference it (see record_one_backref()).
 *
 * Old extents that end up with no backref are removed from the list and
 * freed.  Returns true if at least one old extent is left to relink, false
 * if none is left or if walking the backrefs failed with anything other
 * than -ENOENT.
 */
static noinline bool record_extent_backrefs(struct btrfs_path *path,
					     struct new_sa_defrag_extent *new)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
	struct old_sa_defrag_extent *cur, *next;

	/* record_one_backref() picks the path up from here. */
	new->path = path;

	list_for_each_entry_safe(cur, next, &new->head, list) {
		u64 logical = cur->bytenr + cur->extent_offset;
		int ret;

		ret = iterate_inodes_from_logical(logical, fs_info, path,
						  record_one_backref, cur,
						  false);
		if (ret < 0 && ret != -ENOENT)
			return false;

		/* no backref to be processed for this extent */
		if (cur->count == 0) {
			list_del(&cur->list);
			kfree(cur);
		}
	}

	return !list_empty(&new->head);
}
|
|
|
|
|
|
|
|
static int relink_is_mergable(struct extent_buffer *leaf,
|
|
|
|
struct btrfs_file_extent_item *fi,
|
2013-08-02 08:30:40 +00:00
|
|
|
struct new_sa_defrag_extent *new)
|
2013-01-29 03:18:40 +00:00
|
|
|
{
|
2013-08-02 08:30:40 +00:00
|
|
|
if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
|
2013-01-29 03:18:40 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
|
|
|
|
return 0;
|
|
|
|
|
2013-08-02 08:30:40 +00:00
|
|
|
if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (btrfs_file_extent_encryption(leaf, fi) ||
|
2013-01-29 03:18:40 +00:00
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note the backref might has changed, and in this case we just return 0.
|
|
|
|
*/
|
|
|
|
static noinline int relink_extent_backref(struct btrfs_path *path,
|
|
|
|
struct sa_defrag_extent_backref *prev,
|
|
|
|
struct sa_defrag_extent_backref *backref)
|
|
|
|
{
|
|
|
|
struct btrfs_file_extent_item *extent;
|
|
|
|
struct btrfs_file_extent_item *item;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct old_sa_defrag_extent *old = backref->old;
|
|
|
|
struct new_sa_defrag_extent *new = old->new;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
|
2013-01-29 03:18:40 +00:00
|
|
|
struct inode *inode;
|
|
|
|
struct extent_state *cached = NULL;
|
|
|
|
int ret = 0;
|
|
|
|
u64 start;
|
|
|
|
u64 len;
|
|
|
|
u64 lock_start;
|
|
|
|
u64 lock_end;
|
|
|
|
bool merge = false;
|
|
|
|
int index;
|
|
|
|
|
|
|
|
if (prev && prev->root_id == backref->root_id &&
|
|
|
|
prev->inum == backref->inum &&
|
|
|
|
prev->file_pos + prev->num_bytes == backref->file_pos)
|
|
|
|
merge = true;
|
|
|
|
|
|
|
|
/* step 1: get root */
|
|
|
|
key.objectid = backref->root_id;
|
|
|
|
key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
index = srcu_read_lock(&fs_info->subvol_srcu);
|
|
|
|
|
|
|
|
root = btrfs_read_fs_root_no_name(fs_info, &key);
|
|
|
|
if (IS_ERR(root)) {
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
if (PTR_ERR(root) == -ENOENT)
|
|
|
|
return 0;
|
|
|
|
return PTR_ERR(root);
|
|
|
|
}
|
|
|
|
|
2014-02-08 15:46:35 +00:00
|
|
|
if (btrfs_root_readonly(root)) {
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-01-29 03:18:40 +00:00
|
|
|
/* step 2: get inode */
|
|
|
|
key.objectid = backref->inum;
|
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
|
|
|
|
/* step 3: relink backref */
|
|
|
|
lock_start = backref->file_pos;
|
|
|
|
lock_end = backref->file_pos + backref->num_bytes - 1;
|
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
|
2015-12-03 13:30:40 +00:00
|
|
|
&cached);
|
2013-01-29 03:18:40 +00:00
|
|
|
|
|
|
|
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
|
|
|
|
if (ordered) {
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = backref->inum;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = backref->file_pos;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out_free_path;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
if (btrfs_file_extent_generation(path->nodes[0], extent) !=
|
|
|
|
backref->generation)
|
|
|
|
goto out_free_path;
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
start = backref->file_pos;
|
|
|
|
if (backref->extent_offset < old->extent_offset + old->offset)
|
|
|
|
start += old->extent_offset + old->offset -
|
|
|
|
backref->extent_offset;
|
|
|
|
|
|
|
|
len = min(backref->extent_offset + backref->num_bytes,
|
|
|
|
old->extent_offset + old->offset + old->len);
|
|
|
|
len -= max(backref->extent_offset, old->extent_offset + old->offset);
|
|
|
|
|
|
|
|
ret = btrfs_drop_extents(trans, root, inode, start,
|
|
|
|
start + len, 1);
|
|
|
|
if (ret)
|
|
|
|
goto out_free_path;
|
|
|
|
again:
|
2017-01-10 18:35:31 +00:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2013-01-29 03:18:40 +00:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = start;
|
|
|
|
|
2013-03-11 09:20:58 +00:00
|
|
|
path->leave_spinning = 1;
|
2013-01-29 03:18:40 +00:00
|
|
|
if (merge) {
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
u64 extent_len;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
|
2014-01-23 05:41:09 +00:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
|
2013-01-29 03:18:40 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_path;
|
|
|
|
|
|
|
|
path->slots[0]--;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
extent_len = btrfs_file_extent_num_bytes(leaf, fi);
|
|
|
|
|
2013-08-02 08:30:40 +00:00
|
|
|
if (extent_len + found_key.offset == start &&
|
|
|
|
relink_is_mergable(leaf, fi, new)) {
|
2013-01-29 03:18:40 +00:00
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi,
|
|
|
|
extent_len + len);
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
inode_add_bytes(inode, len);
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
goto out_free_path;
|
|
|
|
} else {
|
|
|
|
merge = false;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
sizeof(*extent));
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-01-29 03:18:40 +00:00
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
|
|
|
|
btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
|
|
|
|
btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
|
|
|
|
btrfs_set_file_extent_num_bytes(leaf, item, len);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
|
|
|
|
btrfs_set_file_extent_generation(leaf, item, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
|
|
|
|
btrfs_set_file_extent_compression(leaf, item, new->compress_type);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, item, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, item, 0);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
inode_add_bytes(inode, len);
|
2013-03-11 09:20:58 +00:00
|
|
|
btrfs_release_path(path);
|
2013-01-29 03:18:40 +00:00
|
|
|
|
2017-09-29 19:43:49 +00:00
|
|
|
ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
|
2013-01-29 03:18:40 +00:00
|
|
|
new->disk_len, 0,
|
|
|
|
backref->root_id, backref->inum,
|
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had a big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->red_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eleminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 06:52:54 +00:00
|
|
|
new->file_pos); /* start - extent_offset */
|
2013-01-29 03:18:40 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-01-29 03:18:40 +00:00
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
out_free_path:
|
|
|
|
btrfs_release_path(path);
|
2013-03-11 09:20:58 +00:00
|
|
|
path->leave_spinning = 0;
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2013-01-29 03:18:40 +00:00
|
|
|
out_unlock:
|
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached);
|
2013-01-29 03:18:40 +00:00
|
|
|
iput(inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-29 02:45:05 +00:00
|
|
|
static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
|
|
|
|
{
|
|
|
|
struct old_sa_defrag_extent *old, *tmp;
|
|
|
|
|
|
|
|
if (!new)
|
|
|
|
return;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(old, tmp, &new->head, list) {
|
|
|
|
kfree(old);
|
|
|
|
}
|
|
|
|
kfree(new);
|
|
|
|
}
|
|
|
|
|
2013-01-29 03:18:40 +00:00
|
|
|
static void relink_file_extents(struct new_sa_defrag_extent *new)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
|
2013-01-29 03:18:40 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct sa_defrag_extent_backref *backref;
|
|
|
|
struct sa_defrag_extent_backref *prev = NULL;
|
|
|
|
struct inode *inode;
|
|
|
|
struct rb_node *node;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
inode = new->inode;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!record_extent_backrefs(path, new)) {
|
|
|
|
btrfs_free_path(path);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
node = rb_first(&new->root);
|
|
|
|
if (!node)
|
|
|
|
break;
|
|
|
|
rb_erase(node, &new->root);
|
|
|
|
|
|
|
|
backref = rb_entry(node, struct sa_defrag_extent_backref, node);
|
|
|
|
|
|
|
|
ret = relink_extent_backref(path, prev, backref);
|
|
|
|
WARN_ON(ret < 0);
|
|
|
|
|
|
|
|
kfree(prev);
|
|
|
|
|
|
|
|
if (ret == 1)
|
|
|
|
prev = backref;
|
|
|
|
else
|
|
|
|
prev = NULL;
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
kfree(prev);
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
out:
|
2013-10-29 02:45:05 +00:00
|
|
|
free_sa_defrag_extent(new);
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
atomic_dec(&fs_info->defrag_running);
|
|
|
|
wake_up(&fs_info->transaction_wait);
|
2013-01-29 03:18:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct new_sa_defrag_extent *
|
|
|
|
record_old_file_extents(struct inode *inode,
|
|
|
|
struct btrfs_ordered_extent *ordered)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2013-01-29 03:18:40 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2013-10-29 02:45:05 +00:00
|
|
|
struct old_sa_defrag_extent *old;
|
2013-01-29 03:18:40 +00:00
|
|
|
struct new_sa_defrag_extent *new;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
new = kmalloc(sizeof(*new), GFP_NOFS);
|
|
|
|
if (!new)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
new->inode = inode;
|
|
|
|
new->file_pos = ordered->file_offset;
|
|
|
|
new->len = ordered->len;
|
|
|
|
new->bytenr = ordered->start;
|
|
|
|
new->disk_len = ordered->disk_len;
|
|
|
|
new->compress_type = ordered->compress_type;
|
|
|
|
new->root = RB_ROOT;
|
|
|
|
INIT_LIST_HEAD(&new->head);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
goto out_kfree;
|
|
|
|
|
2017-01-10 18:35:31 +00:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2013-01-29 03:18:40 +00:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = new->file_pos;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_path;
|
|
|
|
if (ret > 0 && path->slots[0] > 0)
|
|
|
|
path->slots[0]--;
|
|
|
|
|
|
|
|
/* find out all the old extents for the file range */
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_file_extent_item *extent;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
int slot;
|
|
|
|
u64 num_bytes;
|
|
|
|
u64 offset;
|
|
|
|
u64 end;
|
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 extent_offset;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
|
|
|
|
if (slot >= btrfs_header_nritems(l)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
2013-10-29 02:45:05 +00:00
|
|
|
goto out_free_path;
|
2013-01-29 03:18:40 +00:00
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(l, &key, slot);
|
|
|
|
|
2017-01-10 18:35:31 +00:00
|
|
|
if (key.objectid != btrfs_ino(BTRFS_I(inode)))
|
2013-01-29 03:18:40 +00:00
|
|
|
break;
|
|
|
|
if (key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
break;
|
|
|
|
if (key.offset >= new->file_pos + new->len)
|
|
|
|
break;
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
num_bytes = btrfs_file_extent_num_bytes(l, extent);
|
|
|
|
if (key.offset + num_bytes < new->file_pos)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
|
|
|
|
if (!disk_bytenr)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
extent_offset = btrfs_file_extent_offset(l, extent);
|
|
|
|
|
|
|
|
old = kmalloc(sizeof(*old), GFP_NOFS);
|
|
|
|
if (!old)
|
2013-10-29 02:45:05 +00:00
|
|
|
goto out_free_path;
|
2013-01-29 03:18:40 +00:00
|
|
|
|
|
|
|
offset = max(new->file_pos, key.offset);
|
|
|
|
end = min(new->file_pos + new->len, key.offset + num_bytes);
|
|
|
|
|
|
|
|
old->bytenr = disk_bytenr;
|
|
|
|
old->extent_offset = extent_offset;
|
|
|
|
old->offset = offset - key.offset;
|
|
|
|
old->len = end - offset;
|
|
|
|
old->new = new;
|
|
|
|
old->count = 0;
|
|
|
|
list_add_tail(&old->list, &new->head);
|
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
2016-06-22 22:54:23 +00:00
|
|
|
atomic_inc(&fs_info->defrag_running);
|
2013-01-29 03:18:40 +00:00
|
|
|
|
|
|
|
return new;
|
|
|
|
|
|
|
|
out_free_path:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
out_kfree:
|
2013-10-29 02:45:05 +00:00
|
|
|
free_sa_defrag_extent(new);
|
2013-01-29 03:18:40 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
u64 start, u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache;
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, start);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
ASSERT(cache);
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
cache->delalloc_bytes -= len;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/* as ordered data IO finishes, this gets called so we can finish
|
|
|
|
* an ordered extent if the range of bytes in the file it covers is
|
|
|
|
* fully written.
|
|
|
|
*/
|
2012-05-02 18:00:54 +00:00
|
|
|
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
|
2008-07-17 16:53:50 +00:00
|
|
|
{
|
2012-05-02 18:00:54 +00:00
|
|
|
struct inode *inode = ordered_extent->inode;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-07-17 16:53:50 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 14:48:47 +00:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2008-07-17 16:53:50 +00:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2013-01-29 03:18:40 +00:00
|
|
|
struct new_sa_defrag_extent *new = NULL;
|
2010-12-17 06:21:50 +00:00
|
|
|
int compress_type = 0;
|
2013-08-29 17:57:21 +00:00
|
|
|
int ret = 0;
|
|
|
|
u64 logical_len = ordered_extent->len;
|
2011-04-20 02:33:24 +00:00
|
|
|
bool nolock;
|
2013-08-29 17:57:21 +00:00
|
|
|
bool truncated = false;
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
bool range_locked = false;
|
|
|
|
bool clear_new_delalloc_bytes = false;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
|
|
|
|
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
|
|
|
|
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
|
|
|
|
clear_new_delalloc_bytes = true;
|
2008-07-17 16:53:50 +00:00
|
|
|
|
2017-02-20 11:50:35 +00:00
|
|
|
nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
|
2010-07-02 16:14:14 +00:00
|
|
|
|
2012-05-02 18:00:54 +00:00
|
|
|
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:50:57 +00:00
|
|
|
btrfs_free_io_failure_record(BTRFS_I(inode),
|
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
|
|
|
ordered_extent->len - 1);
|
Btrfs: cleanup the read failure record after write or when the inode is freeing
After the data is written successfully, we should cleanup the read failure record
in that range because
- If we set data COW for the file, the range that the failure record pointed to is
mapped to a new place, so it is invalid.
- If we set no data COW for the file, and if there is no error during writting,
the corrupted data is corrected, so the failure record can be removed. And if
some errors happen on the mirrors, we also needn't worry about it because the
failure record will be recreated if we read the same place again.
Sometimes, we may fail to correct the data, so the failure records will be left
in the tree, we need free them when we free the inode or the memory leak happens.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-12 10:44:04 +00:00
|
|
|
|
2013-08-29 17:57:21 +00:00
|
|
|
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
|
|
|
|
truncated = true;
|
|
|
|
logical_len = ordered_extent->truncated_len;
|
|
|
|
/* Truncated the entire extent, don't bother adding */
|
|
|
|
if (!logical_len)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2009-11-12 09:34:21 +00:00
|
|
|
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
|
2012-03-12 15:03:00 +00:00
|
|
|
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
|
2015-09-08 09:25:56 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For mwrite(mmap + memset to write) case, we still reserve
|
|
|
|
* space for NOCOW range.
|
|
|
|
* As NOCOW won't cause a new delayed ref, just free the space
|
|
|
|
*/
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
|
2015-09-08 09:25:56 +00:00
|
|
|
ordered_extent->len);
|
2012-11-09 15:53:21 +00:00
|
|
|
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
|
|
|
|
if (nolock)
|
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
|
|
|
else
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
goto out;
|
2009-11-12 09:34:21 +00:00
|
|
|
}
|
2017-10-19 18:15:57 +00:00
|
|
|
trans->block_rsv = &BTRFS_I(inode)->block_rsv;
|
2012-11-09 15:53:21 +00:00
|
|
|
ret = btrfs_update_inode_fallback(trans, root, inode);
|
|
|
|
if (ret) /* -ENOMEM or corruption */
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2009-11-12 09:34:21 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2008-07-17 16:53:50 +00:00
|
|
|
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
range_locked = true;
|
2010-02-03 19:33:23 +00:00
|
|
|
lock_extent_bits(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
2015-12-03 13:30:40 +00:00
|
|
|
&cached_state);
|
2008-07-17 16:53:50 +00:00
|
|
|
|
2013-01-29 03:18:40 +00:00
|
|
|
ret = test_range_bit(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
2017-05-26 23:44:23 +00:00
|
|
|
EXTENT_DEFRAG, 0, cached_state);
|
2013-01-29 03:18:40 +00:00
|
|
|
if (ret) {
|
|
|
|
u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
|
2014-01-29 21:05:30 +00:00
|
|
|
if (0 && last_snapshot >= BTRFS_I(inode)->generation)
|
2013-01-29 03:18:40 +00:00
|
|
|
/* the inode is shared */
|
|
|
|
new = record_old_file_extents(inode, ordered_extent);
|
|
|
|
|
|
|
|
clear_extent_bit(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
2017-10-31 15:37:52 +00:00
|
|
|
EXTENT_DEFRAG, 0, 0, &cached_state);
|
2013-01-29 03:18:40 +00:00
|
|
|
}
|
|
|
|
|
2010-07-02 16:14:14 +00:00
|
|
|
if (nolock)
|
2011-04-13 16:54:33 +00:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2010-07-02 16:14:14 +00:00
|
|
|
else
|
2011-04-13 16:54:33 +00:00
|
|
|
trans = btrfs_join_transaction(root);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
goto out;
|
2012-03-12 15:03:00 +00:00
|
|
|
}
|
2014-05-22 23:18:52 +00:00
|
|
|
|
2017-10-19 18:15:57 +00:00
|
|
|
trans->block_rsv = &BTRFS_I(inode)->block_rsv;
|
2009-11-12 09:34:21 +00:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
|
2010-12-17 06:21:50 +00:00
|
|
|
compress_type = ordered_extent->compress_type;
|
2008-10-30 18:25:28 +00:00
|
|
|
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
|
2010-12-17 06:21:50 +00:00
|
|
|
BUG_ON(compress_type);
|
2017-10-30 22:29:10 +00:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
|
|
|
|
ordered_extent->len);
|
2017-02-20 11:50:48 +00:00
|
|
|
ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
|
2008-10-30 18:25:28 +00:00
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
2013-08-29 17:57:21 +00:00
|
|
|
logical_len);
|
2008-10-30 18:25:28 +00:00
|
|
|
} else {
|
2016-06-22 22:54:23 +00:00
|
|
|
BUG_ON(root == fs_info->tree_root);
|
2008-10-30 18:25:28 +00:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->start,
|
|
|
|
ordered_extent->disk_len,
|
2013-08-29 17:57:21 +00:00
|
|
|
logical_len, logical_len,
|
2010-12-17 06:21:50 +00:00
|
|
|
compress_type, 0, 0,
|
2008-10-30 18:25:28 +00:00
|
|
|
BTRFS_FILE_EXTENT_REG);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
if (!ret)
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_release_delalloc_bytes(fs_info,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
ordered_extent->start,
|
|
|
|
ordered_extent->disk_len);
|
2008-10-30 18:25:28 +00:00
|
|
|
}
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
|
|
|
|
ordered_extent->file_offset, ordered_extent->len,
|
|
|
|
trans->transid);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret < 0) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
goto out;
|
2012-03-12 15:03:00 +00:00
|
|
|
}
|
2010-02-03 19:33:23 +00:00
|
|
|
|
2018-01-08 08:59:43 +00:00
|
|
|
ret = add_pending_csums(trans, inode, &ordered_extent->list);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2008-07-17 16:53:50 +00:00
|
|
|
|
2012-11-09 15:53:21 +00:00
|
|
|
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
|
|
|
|
ret = btrfs_update_inode_fallback(trans, root, inode);
|
|
|
|
if (ret) { /* -ENOMEM or corruption */
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
goto out;
|
2011-04-05 23:25:36 +00:00
|
|
|
}
|
|
|
|
ret = 0;
|
2009-11-12 09:34:21 +00:00
|
|
|
out:
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
if (range_locked || clear_new_delalloc_bytes) {
|
|
|
|
unsigned int clear_bits = 0;
|
|
|
|
|
|
|
|
if (range_locked)
|
|
|
|
clear_bits |= EXTENT_LOCKED;
|
|
|
|
if (clear_new_delalloc_bytes)
|
|
|
|
clear_bits |= EXTENT_DELALLOC_NEW;
|
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree,
|
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
|
|
|
ordered_extent->len - 1,
|
|
|
|
clear_bits,
|
|
|
|
(clear_bits & EXTENT_LOCKED) ? 1 : 0,
|
2017-10-31 15:37:52 +00:00
|
|
|
0, &cached_state);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
}
|
|
|
|
|
2012-09-20 07:51:59 +00:00
|
|
|
if (trans)
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2010-07-02 16:14:14 +00:00
|
|
|
|
2013-08-29 17:57:21 +00:00
|
|
|
if (ret || truncated) {
|
|
|
|
u64 start, end;
|
|
|
|
|
|
|
|
if (truncated)
|
|
|
|
start = ordered_extent->file_offset + logical_len;
|
|
|
|
else
|
|
|
|
start = ordered_extent->file_offset;
|
|
|
|
end = ordered_extent->file_offset + ordered_extent->len - 1;
|
2017-10-31 16:02:39 +00:00
|
|
|
clear_extent_uptodate(io_tree, start, end, NULL);
|
2013-08-29 17:57:21 +00:00
|
|
|
|
|
|
|
/* Drop the cache for the part of the extent we didn't write. */
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
|
2012-05-02 18:00:54 +00:00
|
|
|
|
2013-01-31 19:58:00 +00:00
|
|
|
/*
|
|
|
|
* If the ordered extent had an IOERR or something else went
|
|
|
|
* wrong we need to return the space for this ordered extent
|
2013-08-29 17:57:21 +00:00
|
|
|
* back to the allocator. We only free the extent in the
|
|
|
|
* truncated case if we didn't write out the extent at all.
|
2013-01-31 19:58:00 +00:00
|
|
|
*/
|
2013-08-29 17:57:21 +00:00
|
|
|
if ((ret || !logical_len) &&
|
|
|
|
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
|
2013-01-31 19:58:00 +00:00
|
|
|
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_free_reserved_extent(fs_info,
|
|
|
|
ordered_extent->start,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree or the
free space cache. When we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variable to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
ordered_extent->disk_len, 1);
|
2013-01-31 19:58:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-05-02 18:00:54 +00:00
|
|
|
/*
|
2012-06-18 04:14:23 +00:00
|
|
|
* This needs to be done to make sure anybody waiting knows we are done
|
|
|
|
* updating everything for this ordered extent.
|
2012-05-02 18:00:54 +00:00
|
|
|
*/
|
|
|
|
btrfs_remove_ordered_extent(inode, ordered_extent);
|
|
|
|
|
2013-01-29 03:18:40 +00:00
|
|
|
/* for snapshot-aware defrag */
|
2013-10-29 02:45:05 +00:00
|
|
|
if (new) {
|
|
|
|
if (ret) {
|
|
|
|
free_sa_defrag_extent(new);
|
2016-06-22 22:54:23 +00:00
|
|
|
atomic_dec(&fs_info->defrag_running);
|
2013-10-29 02:45:05 +00:00
|
|
|
} else {
|
|
|
|
relink_file_extents(new);
|
|
|
|
}
|
|
|
|
}
|
2013-01-29 03:18:40 +00:00
|
|
|
|
2008-07-17 16:53:50 +00:00
|
|
|
/* once for us */
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
/* once for the tree */
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
|
btrfs: balance dirty metadata pages in btrfs_finish_ordered_io
[Problem description and how we fix it]
We should balance dirty metadata pages at the end of
btrfs_finish_ordered_io, since a small, unmergeable random write can
potentially produce dirty metadata which is multiple times larger than
the data itself. For example, a small, unmergeable 4KiB write may
produce:
16KiB dirty leaf (and possibly 16KiB dirty node) in subvolume tree
16KiB dirty leaf (and possibly 16KiB dirty node) in checksum tree
16KiB dirty leaf (and possibly 16KiB dirty node) in extent tree
Although we do call balance dirty pages on the write side, in the
buffered write path, most metadata are dirtied only after we reach the
dirty background limit (which by far only counts dirty data pages) and
wakeup the flusher thread. If there are many small, unmergeable random
writes spread in a large btree, we'll find a burst of dirty pages
exceeds the dirty_bytes limit after we wakeup the flusher thread - which
is not what we expect. In our machine, it caused out-of-memory problem
since a page cannot be dropped if it is marked dirty.
Someone may worry that we may sleep in btrfs_btree_balance_dirty_nodelay,
but since we do btrfs_finish_ordered_io in a separate worker, it will not
stop the flusher consuming dirty pages. Also, we use a different worker for
metadata writeback endio; sleeping in btrfs_finish_ordered_io helps us throttle
the size of dirty metadata pages.
[Reproduce steps]
To reproduce the problem, we need to do 4KiB write randomly spread in a
large btree. In our 2GiB RAM machine:
1) Create 4 subvolumes.
2) Run fio on each subvolume:
[global]
direct=0
rw=randwrite
ioengine=libaio
bs=4k
iodepth=16
numjobs=1
group_reporting
size=128G
runtime=1800
norandommap
time_based
randrepeat=0
3) Take snapshot on each subvolume and repeat fio on existing files.
4) Repeat step (3) until we get large btrees.
In our case, by observing btrfs_root_item->bytes_used, we have 2GiB of
metadata in each subvolume tree and 12GiB of metadata in extent tree.
5) Stop all fio, take snapshot again, and wait until all delayed work is
completed.
6) Start all fio. Few seconds later we hit OOM when the flusher starts
to work.
It can be reproduced even when using nocow write.
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-28 05:48:20 +00:00
|
|
|
/* Try to release some metadata so we don't get an OOM but don't wait */
|
|
|
|
btrfs_btree_balance_dirty_nodelay(fs_info);
|
|
|
|
|
2012-05-02 18:00:54 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Handler taking a struct btrfs_work: map @work back to the
 * btrfs_ordered_extent that embeds it (through its 'work' member) and run
 * btrfs_finish_ordered_io() on that extent. Any value returned by
 * btrfs_finish_ordered_io() is ignored here.
 */
static void finish_ordered_fn(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct btrfs_ordered_extent *ordered_extent;
|
|
|
|
ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
|
|
|
|
btrfs_finish_ordered_io(ordered_extent);
|
2008-07-17 16:53:50 +00:00
|
|
|
}
|
|
|
|
|
2017-02-17 14:18:32 +00:00
|
|
|
static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
|
2008-07-18 15:56:15 +00:00
|
|
|
struct extent_state *state, int uptodate)
|
|
|
|
{
|
2012-05-02 18:00:54 +00:00
|
|
|
struct inode *inode = page->mapping->host;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2012-05-02 18:00:54 +00:00
|
|
|
struct btrfs_ordered_extent *ordered_extent = NULL;
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readahead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wrapper of normal_work_helper.
With this patch, I no longer hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
|
|
|
struct btrfs_workqueue *wq;
|
|
|
|
btrfs_work_func_t func;
|
2012-05-02 18:00:54 +00:00
|
|
|
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
|
|
|
|
|
2009-09-02 20:53:46 +00:00
|
|
|
ClearPagePrivate2(page);
|
2012-05-02 18:00:54 +00:00
|
|
|
if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
|
|
|
|
end - start + 1, uptodate))
|
2017-02-17 14:18:32 +00:00
|
|
|
return;
|
2012-05-02 18:00:54 +00:00
|
|
|
|
2017-02-20 11:50:35 +00:00
|
|
|
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
|
2016-06-22 22:54:23 +00:00
|
|
|
wq = fs_info->endio_freespace_worker;
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readahead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wrapper of normal_work_helper.
With this patch, I no longer hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
|
|
|
func = btrfs_freespace_write_helper;
|
|
|
|
} else {
|
2016-06-22 22:54:23 +00:00
|
|
|
wq = fs_info->endio_write_workers;
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readahead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wrapper of normal_work_helper.
With this patch, I no longer hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
|
|
|
func = btrfs_endio_write_helper;
|
|
|
|
}
|
2012-05-02 18:00:54 +00:00
|
|
|
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readahead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wrapper of normal_work_helper.
With this patch, I no longer hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
|
|
|
btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
|
|
|
|
NULL);
|
|
|
|
btrfs_queue_work(wq, &ordered_extent->work);
|
2008-07-18 15:56:15 +00:00
|
|
|
}
|
|
|
|
|
2014-09-12 10:43:55 +00:00
|
|
|
/*
 * Checksum @len bytes of @page starting at byte @pgoff and compare the result
 * with the expected value found at index @icsum of the u32 array stored in
 * @io_bio->csum.
 *
 * Returns 0 when both checksums are equal.  On a mismatch
 * btrfs_print_data_csum_error() is called with @start and the two checksums,
 * the checked range of the page is filled with 0x01 bytes, the dcache is
 * flushed for the page and -EIO is returned.
 */
static int __readpage_endio_check(struct inode *inode,
				  struct btrfs_io_bio *io_bio,
				  int icsum, struct page *page,
				  int pgoff, u64 start, size_t len)
{
	u32 *expected_csums = (u32 *)io_bio->csum;
	u32 csum_expected = expected_csums[icsum];
	u32 csum = ~(u32)0;
	char *kaddr;
	int ret = 0;

	kaddr = kmap_atomic(page);

	csum = btrfs_csum_data(kaddr + pgoff, csum, len);
	btrfs_csum_final(csum, (u8 *)&csum);

	if (csum != csum_expected) {
		btrfs_print_data_csum_error(BTRFS_I(inode), start, csum,
					    csum_expected,
					    io_bio->mirror_num);
		/* The bad range is overwritten with 1s, not zeroes. */
		memset(kaddr + pgoff, 1, len);
		flush_dcache_page(page);
		ret = -EIO;
	}

	/* The mapping is dropped on both the match and mismatch paths. */
	kunmap_atomic(kaddr);
	return ret;
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* when reads are done, we need to check csums to verify the data is correct
|
2011-07-22 13:41:52 +00:00
|
|
|
* if there's a match, we allow the bio to finish. If not, the code in
|
|
|
|
* extent_io.c will try to find good copies for us.
|
2008-09-29 19:18:18 +00:00
|
|
|
*/
|
2013-07-25 11:22:34 +00:00
|
|
|
static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
|
|
|
|
u64 phy_offset, struct page *page,
|
|
|
|
u64 start, u64 end, int mirror)
|
2007-08-30 12:50:51 +00:00
|
|
|
{
|
2012-12-21 09:17:45 +00:00
|
|
|
size_t offset = start - page_offset(page);
|
2007-08-30 12:50:51 +00:00
|
|
|
struct inode *inode = page->mapping->host;
|
2008-01-24 21:13:08 +00:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2007-10-15 20:22:25 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-01-24 21:13:08 +00:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 21:58:54 +00:00
|
|
|
if (PageChecked(page)) {
|
|
|
|
ClearPageChecked(page);
|
2014-09-12 10:43:55 +00:00
|
|
|
return 0;
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 21:58:54 +00:00
|
|
|
}
|
2009-04-17 08:37:41 +00:00
|
|
|
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
|
2014-09-12 10:43:55 +00:00
|
|
|
return 0;
|
2008-12-12 15:03:38 +00:00
|
|
|
|
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
|
2009-09-02 19:22:30 +00:00
|
|
|
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
|
2016-04-26 21:54:39 +00:00
|
|
|
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
|
2007-12-14 20:30:32 +00:00
|
|
|
return 0;
|
2008-12-12 15:03:38 +00:00
|
|
|
}
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 21:58:54 +00:00
|
|
|
|
2013-07-25 11:22:34 +00:00
|
|
|
phy_offset >>= inode->i_sb->s_blocksize_bits;
|
2014-09-12 10:43:55 +00:00
|
|
|
return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
|
|
|
|
start, (size_t)(end - start + 1));
|
2007-08-30 12:50:51 +00:00
|
|
|
}
|
2007-08-27 20:49:44 +00:00
|
|
|
|
2018-01-16 07:31:58 +00:00
|
|
|
/*
|
|
|
|
* btrfs_add_delayed_iput - perform a delayed iput on @inode
|
|
|
|
*
|
|
|
|
* @inode: The inode we want to perform iput on
|
|
|
|
*
|
|
|
|
* This function uses the generic vfs_inode::i_count to track whether we should
|
|
|
|
* just decrement it (in case it's > 1) or if this is the last iput then link
|
|
|
|
* the inode to the delayed iput machinery. Delayed iputs are processed at
|
|
|
|
* transaction commit time/superblock commit/cleaner kthread.
|
|
|
|
*/
|
2009-11-12 09:36:34 +00:00
|
|
|
void btrfs_add_delayed_iput(struct inode *inode)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2015-11-19 13:15:51 +00:00
|
|
|
struct btrfs_inode *binode = BTRFS_I(inode);
|
2009-11-12 09:36:34 +00:00
|
|
|
|
|
|
|
if (atomic_add_unless(&inode->i_count, -1, 1))
|
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2018-01-16 07:31:58 +00:00
|
|
|
ASSERT(list_empty(&binode->delayed_iput));
|
|
|
|
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
|
2009-11-12 09:36:34 +00:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
|
2009-11-12 09:36:34 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2015-11-19 13:15:51 +00:00
|
|
|
while (!list_empty(&fs_info->delayed_iputs)) {
|
|
|
|
struct btrfs_inode *inode;
|
|
|
|
|
|
|
|
inode = list_first_entry(&fs_info->delayed_iputs,
|
|
|
|
struct btrfs_inode, delayed_iput);
|
2018-01-16 07:31:58 +00:00
|
|
|
list_del_init(&inode->delayed_iput);
|
2015-11-19 13:15:51 +00:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
iput(&inode->vfs_inode);
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2009-11-12 09:36:34 +00:00
|
|
|
}
|
2015-11-19 13:15:51 +00:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
2009-11-12 09:36:34 +00:00
|
|
|
}
|
|
|
|
|
2008-07-24 16:17:14 +00:00
|
|
|
/*
|
2018-05-11 20:13:32 +00:00
|
|
|
* This creates an orphan entry for the given inode in case something goes wrong
|
|
|
|
* in the middle of an unlink.
|
2008-07-24 16:17:14 +00:00
|
|
|
*/
|
2017-02-20 11:50:59 +00:00
|
|
|
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
|
2018-05-11 20:13:37 +00:00
|
|
|
struct btrfs_inode *inode)
|
2008-07-24 16:17:14 +00:00
|
|
|
{
|
2010-05-16 14:49:58 +00:00
|
|
|
int ret;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2018-05-11 20:13:37 +00:00
|
|
|
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
|
|
|
|
if (ret && ret != -EEXIST) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
return ret;
|
2010-05-16 14:49:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2008-07-24 16:17:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2018-05-11 20:13:32 +00:00
|
|
|
* We have done the delete so we can go ahead and remove the orphan item for
|
|
|
|
* this particular inode.
|
2008-07-24 16:17:14 +00:00
|
|
|
*/
|
2013-04-25 20:41:01 +00:00
|
|
|
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
|
2017-02-20 11:50:58 +00:00
|
|
|
struct btrfs_inode *inode)
|
2008-07-24 16:17:14 +00:00
|
|
|
{
|
2018-05-11 20:13:37 +00:00
|
|
|
return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
|
2008-07-24 16:17:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this cleans up any orphans that may be left on the list from the last use
|
|
|
|
* of this root.
|
|
|
|
*/
|
2011-01-31 21:22:42 +00:00
|
|
|
int btrfs_orphan_cleanup(struct btrfs_root *root)
|
2008-07-24 16:17:14 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-07-24 16:17:14 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct inode *inode;
|
2011-09-26 19:55:20 +00:00
|
|
|
u64 last_objectid = 0;
|
2018-05-11 20:13:32 +00:00
|
|
|
int ret = 0, nr_unlink = 0;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2010-05-16 14:49:58 +00:00
|
|
|
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
|
2011-01-31 21:22:42 +00:00
|
|
|
return 0;
|
2009-11-12 09:34:40 +00:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-01-31 21:22:42 +00:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2015-11-27 15:31:35 +00:00
|
|
|
path->reada = READA_BACK;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
|
|
|
key.objectid = BTRFS_ORPHAN_OBJECTID;
|
2014-06-04 16:41:45 +00:00
|
|
|
key.type = BTRFS_ORPHAN_ITEM_KEY;
|
2008-07-24 16:17:14 +00:00
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
2011-01-31 21:22:42 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if ret == 0 means we found what we were searching for, which
|
2011-03-31 01:57:33 +00:00
|
|
|
* is weird, but possible, so only screw with path if we didn't
|
2008-07-24 16:17:14 +00:00
|
|
|
* find the key and see if we have stuff that matches
|
|
|
|
*/
|
|
|
|
if (ret > 0) {
|
2011-01-31 21:22:42 +00:00
|
|
|
ret = 0;
|
2008-07-24 16:17:14 +00:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
break;
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pull out the item */
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
|
|
/* make sure the item matches what we want */
|
|
|
|
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
|
|
|
|
break;
|
2014-06-04 16:41:45 +00:00
|
|
|
if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
|
2008-07-24 16:17:14 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* release the path since we're done with it */
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2008-07-24 16:17:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this is where we are basically btrfs_lookup, without the
|
|
|
|
* crossing root thing. we store the inode number in the
|
|
|
|
* offset of the orphan item.
|
|
|
|
*/
|
2011-09-26 19:55:20 +00:00
|
|
|
|
|
|
|
if (found_key.offset == last_objectid) {
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"Error removing orphan entry, stopping orphan cleanup");
|
2011-09-26 19:55:20 +00:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
last_objectid = found_key.offset;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
found_key.objectid = found_key.offset;
|
|
|
|
found_key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
found_key.offset = 0;
|
2016-06-22 22:54:23 +00:00
|
|
|
inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
|
2013-07-15 01:50:32 +00:00
|
|
|
ret = PTR_ERR_OR_ZERO(inode);
|
2016-06-06 10:51:25 +00:00
|
|
|
if (ret && ret != -ENOENT)
|
2011-01-31 21:22:42 +00:00
|
|
|
goto out;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
if (ret == -ENOENT && root == fs_info->tree_root) {
|
2011-12-15 01:12:02 +00:00
|
|
|
struct btrfs_root *dead_root;
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
int is_dead_root = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this is an orphan in the tree root. Currently these
|
|
|
|
* could come from 2 sources:
|
|
|
|
* a) a snapshot deletion in progress
|
|
|
|
* b) a free space cache inode
|
|
|
|
* We need to distinguish those two, as the snapshot
|
|
|
|
* orphan must not get deleted.
|
|
|
|
* find_dead_roots already ran before us, so if this
|
|
|
|
* is a snapshot deletion, we should find the root
|
|
|
|
* in the dead_roots list
|
|
|
|
*/
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
list_for_each_entry(dead_root, &fs_info->dead_roots,
|
|
|
|
root_list) {
|
|
|
|
if (dead_root->root_key.objectid ==
|
|
|
|
found_key.objectid) {
|
|
|
|
is_dead_root = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
if (is_dead_root) {
|
|
|
|
/* prevent this orphan from being found again */
|
|
|
|
key.offset = found_key.objectid - 1;
|
|
|
|
continue;
|
|
|
|
}
|
2018-05-11 20:13:32 +00:00
|
|
|
|
2011-12-15 01:12:02 +00:00
|
|
|
}
|
2018-05-11 20:13:32 +00:00
|
|
|
|
2008-07-24 16:17:14 +00:00
|
|
|
/*
|
2018-05-11 20:13:32 +00:00
|
|
|
* If we have an inode with links, there are a couple of
|
|
|
|
* possibilities. Old kernels (before v3.12) used to create an
|
|
|
|
* orphan item for truncate indicating that there were possibly
|
|
|
|
* extent items past i_size that needed to be deleted. In v3.12,
|
|
|
|
* truncate was changed to update i_size in sync with the extent
|
|
|
|
* items, but the (useless) orphan item was still created. Since
|
|
|
|
* v4.18, we don't create the orphan item for truncate at all.
|
|
|
|
*
|
|
|
|
* So, this item could mean that we need to do a truncate, but
|
|
|
|
* only if this filesystem was last used on a pre-v3.12 kernel
|
|
|
|
* and was not cleanly unmounted. The odds of that are quite
|
|
|
|
* slim, and it's a pain to do the truncate now, so just delete
|
|
|
|
* the orphan item.
|
|
|
|
*
|
|
|
|
* It's also possible that this orphan item was supposed to be
|
|
|
|
* deleted but wasn't. The inode number may have been reused,
|
|
|
|
* but either way, we can delete the orphan item.
|
2008-07-24 16:17:14 +00:00
|
|
|
*/
|
2018-05-11 20:13:32 +00:00
|
|
|
if (ret == -ENOENT || inode->i_nlink) {
|
|
|
|
if (!ret)
|
|
|
|
iput(inode);
|
2011-09-21 20:55:59 +00:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2011-01-31 21:22:42 +00:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_debug(fs_info, "auto deleting %Lu",
|
|
|
|
found_key.objectid);
|
2011-09-21 20:55:59 +00:00
|
|
|
ret = btrfs_del_orphan_item(trans, root,
|
|
|
|
found_key.objectid);
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2013-08-13 18:10:08 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-24 16:17:14 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:13:32 +00:00
|
|
|
nr_unlink++;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
|
|
|
/* this will do delete_inode and everything for us */
|
|
|
|
iput(inode);
|
2011-01-31 21:22:42 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-24 16:17:14 +00:00
|
|
|
}
|
2011-11-11 01:45:05 +00:00
|
|
|
/* release the path since we're done with it */
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
2010-05-16 14:49:58 +00:00
|
|
|
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
|
|
|
|
|
2018-05-11 20:13:38 +00:00
|
|
|
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
|
2011-04-13 16:54:33 +00:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-31 21:22:42 +00:00
|
|
|
if (!IS_ERR(trans))
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2010-05-16 14:49:58 +00:00
|
|
|
}
|
2008-07-24 16:17:14 +00:00
|
|
|
|
|
|
|
if (nr_unlink)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
|
2011-01-31 21:22:42 +00:00
|
|
|
|
|
|
|
out:
|
|
|
|
if (ret)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
|
2011-01-31 21:22:42 +00:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
2008-07-24 16:17:14 +00:00
|
|
|
}
|
|
|
|
|
2009-04-27 15:47:50 +00:00
|
|
|
/*
|
|
|
|
* very simple check to peek ahead in the leaf looking for xattrs. If we
|
|
|
|
* don't find any xattrs, we know there can't be any acls.
|
|
|
|
*
|
|
|
|
* slot is the slot the inode is in, objectid is the objectid of the inode
|
|
|
|
*/
|
|
|
|
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
int slot, u64 objectid,
|
|
|
|
int *first_xattr_slot)
|
2009-04-27 15:47:50 +00:00
|
|
|
{
|
|
|
|
u32 nritems = btrfs_header_nritems(leaf);
|
|
|
|
struct btrfs_key found_key;
|
2013-06-19 14:16:26 +00:00
|
|
|
static u64 xattr_access = 0;
|
|
|
|
static u64 xattr_default = 0;
|
2009-04-27 15:47:50 +00:00
|
|
|
int scanned = 0;
|
|
|
|
|
2013-06-19 14:16:26 +00:00
|
|
|
if (!xattr_access) {
|
2015-12-02 13:44:35 +00:00
|
|
|
xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
|
|
|
|
strlen(XATTR_NAME_POSIX_ACL_ACCESS));
|
|
|
|
xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
|
|
|
|
strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
|
2013-06-19 14:16:26 +00:00
|
|
|
}
|
|
|
|
|
2009-04-27 15:47:50 +00:00
|
|
|
slot++;
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
*first_xattr_slot = -1;
|
2009-04-27 15:47:50 +00:00
|
|
|
while (slot < nritems) {
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
/* we found a different objectid, there must not be acls */
|
|
|
|
if (found_key.objectid != objectid)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* we found an xattr, assume we've got an acl */
|
2013-06-19 14:16:26 +00:00
|
|
|
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
if (*first_xattr_slot == -1)
|
|
|
|
*first_xattr_slot = slot;
|
2013-06-19 14:16:26 +00:00
|
|
|
if (found_key.offset == xattr_access ||
|
|
|
|
found_key.offset == xattr_default)
|
|
|
|
return 1;
|
|
|
|
}
|
2009-04-27 15:47:50 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we found a key greater than an xattr key, there can't
|
|
|
|
* be any acls later on
|
|
|
|
*/
|
|
|
|
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
slot++;
|
|
|
|
scanned++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* it goes inode, inode backrefs, xattrs, extents,
|
|
|
|
* so if there are a ton of hard links to an inode there can
|
|
|
|
* be a lot of backrefs. Don't waste time searching too hard,
|
|
|
|
* this is just an optimization
|
|
|
|
*/
|
|
|
|
if (scanned >= 8)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* we hit the end of the leaf before we found an xattr or
|
|
|
|
* something larger than an xattr. We have to assume the inode
|
|
|
|
* has acls
|
|
|
|
*/
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
if (*first_xattr_slot == -1)
|
|
|
|
*first_xattr_slot = slot;
|
2009-04-27 15:47:50 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* read an inode from the btree into the in-memory inode
|
|
|
|
*/
|
2016-06-06 10:51:25 +00:00
|
|
|
static int btrfs_read_locked_inode(struct inode *inode)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_path *path;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key location;
|
2013-12-26 05:07:06 +00:00
|
|
|
unsigned long ptr;
|
2009-04-27 15:47:50 +00:00
|
|
|
int maybe_acls;
|
2007-07-11 14:18:17 +00:00
|
|
|
u32 rdev;
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret;
|
2011-06-23 07:27:13 +00:00
|
|
|
bool filled = false;
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
int first_xattr_slot;
|
2011-06-23 07:27:13 +00:00
|
|
|
|
|
|
|
ret = btrfs_fill_inode(inode, &rdev);
|
|
|
|
if (!ret)
|
|
|
|
filled = true;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2016-06-06 10:51:25 +00:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
2011-07-12 18:25:31 +00:00
|
|
|
goto make_bad;
|
2016-06-06 10:51:25 +00:00
|
|
|
}
|
2011-07-12 18:25:31 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
|
2008-01-08 20:46:30 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
|
2016-06-06 10:51:25 +00:00
|
|
|
if (ret) {
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
2007-06-12 10:35:45 +00:00
|
|
|
goto make_bad;
|
2016-06-06 10:51:25 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
2011-06-23 07:27:13 +00:00
|
|
|
|
|
|
|
if (filled)
|
2013-12-26 05:07:06 +00:00
|
|
|
goto cache_index;
|
2011-06-23 07:27:13 +00:00
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
|
2011-10-28 12:13:29 +00:00
|
|
|
set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
|
2012-02-10 19:05:07 +00:00
|
|
|
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
|
|
|
|
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
|
2017-02-20 11:50:34 +00:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2014-12-12 16:39:12 +00:00
|
|
|
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
|
|
|
|
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2014-12-12 16:39:12 +00:00
|
|
|
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
|
|
|
|
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2014-12-12 16:39:12 +00:00
|
|
|
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
|
|
|
|
inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2012-07-04 07:18:07 +00:00
|
|
|
BTRFS_I(inode)->i_otime.tv_sec =
|
|
|
|
btrfs_timespec_sec(leaf, &inode_item->otime);
|
|
|
|
BTRFS_I(inode)->i_otime.tv_nsec =
|
|
|
|
btrfs_timespec_nsec(leaf, &inode_item->otime);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2008-10-09 15:46:29 +00:00
|
|
|
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
|
2008-09-05 20:13:11 +00:00
|
|
|
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
|
|
|
|
|
2017-12-11 11:35:12 +00:00
|
|
|
inode_set_iversion_queried(inode,
|
|
|
|
btrfs_inode_sequence(leaf, inode_item));
|
2015-04-09 04:08:43 +00:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
|
|
|
inode->i_rdev = 0;
|
|
|
|
rdev = btrfs_inode_rdev(leaf, inode_item);
|
|
|
|
|
|
|
|
BTRFS_I(inode)->index_cnt = (u64)-1;
|
|
|
|
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
|
|
|
|
|
|
|
|
cache_index:
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
/*
|
|
|
|
* If we were modified in the current generation and evicted from memory
|
|
|
|
* and then re-read we need to do a full sync since we don't have any
|
|
|
|
* idea about which extents were modified before we were evicted from
|
|
|
|
* cache.
|
2015-04-09 04:08:43 +00:00
|
|
|
*
|
|
|
|
* This is required for both inode re-read from disk and delayed inode
|
|
|
|
* in delayed_nodes_tree.
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
*/
|
2016-06-22 22:54:23 +00:00
|
|
|
if (BTRFS_I(inode)->last_trans == fs_info->generation)
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
|
Btrfs: fix stale dir entries after unlink, inode eviction and fsync
If we remove a hard link from an inode, the inode gets evicted, then
we fsync the inode and then power fail/crash, when the log tree is
replayed, the parent directory inode still has entries pointing to
the name that no longer exists, while our inode no longer has the
BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected),
leaving the filesystem in an inconsistent state. The stale directory
entries can not be deleted (an attempt to delete them causes -ESTALE
errors), which makes it impossible to delete the parent directory.
This happens because we track the id of the transaction where the last
unlink operation for the inode happened (last_unlink_trans) in an
in-memory only field of the inode, that is, a value that is never
persisted in the inode item stored on the fs/subvol btree. So if an
inode is evicted and loaded again, the value for last_unlink_trans is
set to 0, which prevents the fsync from logging the parent directory
at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans
to the id of the transaction that last modified the inode when we
load the inode. This is a pessimistic approach but it always ensures
correctness with the trade off of occasional full transaction commits
when an fsync is done against the inode in the same transaction where
it was evicted and reloaded when our inode is a directory and often
logging its parent unnecessarily when our inode is not a directory.
The following test case for fstests triggers the problem:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs generic
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file with 2 hard links.
mkdir $SCRATCH_MNT/testdir
touch $SCRATCH_MNT/testdir/foo
ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
# Make sure everything done so far is durably persisted.
sync
# Now remove one of the links, trigger inode eviction and then fsync
# our inode.
unlink $SCRATCH_MNT/testdir/bar
echo 2 > /proc/sys/vm/drop_caches
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo
# Silently drop all writes on our scratch device to simulate a power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again and mount the fs to trigger log/journal replay.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Now verify our directory entries.
echo "Entries in testdir:"
ls -1 $SCRATCH_MNT/testdir
# If we remove our inode, its parent should become empty and therefore we should
# be able to remove the parent.
rm -f $SCRATCH_MNT/testdir/*
rmdir $SCRATCH_MNT/testdir
_unmount_flakey
# The fstests framework will call fsck against our filesystem which will verify
# that all metadata is in a consistent state.
status=0
exit
The test failed on btrfs with:
generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad)
--- tests/generic/098.out 2015-07-23 18:01:12.616175932 +0100
+++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad 2015-07-23 18:04:58.924138308 +0100
@@ -1,3 +1,6 @@
QA output created by 098
Entries in testdir:
+bar
foo
+rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle
+rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
...
(Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad' to see the entire diff)
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full)
$ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full
(...)
checking fs roots
root 5 inode 258 errors 2001, no inode item, link count wrong
unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref
Checking filesystem on /dev/sdc
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-07-23 23:00:19 +00:00
|
|
|
/*
|
|
|
|
* We don't persist the id of the transaction where an unlink operation
|
|
|
|
* against the inode was last made. So here we assume the inode might
|
|
|
|
* have been evicted, and therefore the exact value of last_unlink_trans
|
|
|
|
* lost, and set it to last_trans to avoid metadata inconsistencies
|
|
|
|
* between the inode and its parent if the inode is fsync'ed and the log
|
|
|
|
* replayed. For example, in the scenario:
|
|
|
|
*
|
|
|
|
* touch mydir/foo
|
|
|
|
* ln mydir/foo mydir/bar
|
|
|
|
* sync
|
|
|
|
* unlink mydir/bar
|
|
|
|
* echo 2 > /proc/sys/vm/drop_caches # evicts inode
|
|
|
|
* xfs_io -c fsync mydir/foo
|
|
|
|
* <power failure>
|
|
|
|
* mount fs, triggers fsync log replay
|
|
|
|
*
|
|
|
|
* We must make sure that when we fsync our inode foo we also log its
|
|
|
|
* parent inode, otherwise after log replay the parent still has the
|
|
|
|
* dentry with the "bar" name but our inode foo has a link count of 1
|
|
|
|
* and doesn't have an inode ref with the name "bar" anymore.
|
|
|
|
*
|
|
|
|
* Setting last_unlink_trans to last_trans is a pessimistic approach,
|
2016-05-20 01:18:45 +00:00
|
|
|
* but it guarantees correctness at the expense of occasional full
|
Btrfs: fix stale dir entries after unlink, inode eviction and fsync
If we remove a hard link from an inode, the inode gets evicted, then
we fsync the inode and then power fail/crash, when the log tree is
replayed, the parent directory inode still has entries pointing to
the name that no longer exists, while our inode no longer has the
BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected),
leaving the filesystem in an inconsistent state. The stale directory
entries can not be deleted (an attempt to delete them causes -ESTALE
errors), which makes it impossible to delete the parent directory.
This happens because we track the id of the transaction where the last
unlink operation for the inode happened (last_unlink_trans) in an
in-memory only field of the inode, that is, a value that is never
persisted in the inode item stored on the fs/subvol btree. So if an
inode is evicted and loaded again, the value for last_unlink_trans is
set to 0, which prevents the fsync from logging the parent directory
at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans
to the id of the transaction that last modified the inode when we
load the inode. This is a pessimistic approach but it always ensures
correctness with the trade off of occasional full transaction commits
when an fsync is done against the inode in the same transaction where
it was evicted and reloaded when our inode is a directory and often
logging its parent unnecessarily when our inode is not a directory.
The following test case for fstests triggers the problem:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs generic
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file with 2 hard links.
mkdir $SCRATCH_MNT/testdir
touch $SCRATCH_MNT/testdir/foo
ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
# Make sure everything done so far is durably persisted.
sync
# Now remove one of the links, trigger inode eviction and then fsync
# our inode.
unlink $SCRATCH_MNT/testdir/bar
echo 2 > /proc/sys/vm/drop_caches
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo
# Silently drop all writes on our scratch device to simulate a power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again and mount the fs to trigger log/journal replay.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Now verify our directory entries.
echo "Entries in testdir:"
ls -1 $SCRATCH_MNT/testdir
# If we remove our inode, its parent should become empty and therefore we should
# be able to remove the parent.
rm -f $SCRATCH_MNT/testdir/*
rmdir $SCRATCH_MNT/testdir
_unmount_flakey
# The fstests framework will call fsck against our filesystem which will verify
# that all metadata is in a consistent state.
status=0
exit
The test failed on btrfs with:
generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad)
--- tests/generic/098.out 2015-07-23 18:01:12.616175932 +0100
+++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad 2015-07-23 18:04:58.924138308 +0100
@@ -1,3 +1,6 @@
QA output created by 098
Entries in testdir:
+bar
foo
+rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle
+rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
...
(Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad' to see the entire diff)
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full)
$ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full
(...)
checking fs roots
root 5 inode 258 errors 2001, no inode item, link count wrong
unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref
Checking filesystem on /dev/sdc
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-07-23 23:00:19 +00:00
|
|
|
* transaction commits on fsync if our inode is a directory, or if our
|
|
|
|
* inode is not a directory, logging its parent unnecessarily.
|
|
|
|
*/
|
|
|
|
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
|
|
|
|
|
2013-12-26 05:07:06 +00:00
|
|
|
path->slots[0]++;
|
|
|
|
if (inode->i_nlink != 1 ||
|
|
|
|
path->slots[0] >= btrfs_header_nritems(leaf))
|
|
|
|
goto cache_acl;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
|
2017-01-10 18:35:31 +00:00
|
|
|
if (location.objectid != btrfs_ino(BTRFS_I(inode)))
|
2013-12-26 05:07:06 +00:00
|
|
|
goto cache_acl;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
if (location.type == BTRFS_INODE_REF_KEY) {
|
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
|
|
|
|
ref = (struct btrfs_inode_ref *)ptr;
|
|
|
|
BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
|
|
|
|
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
|
|
|
|
extref = (struct btrfs_inode_extref *)ptr;
|
|
|
|
BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
|
|
|
|
extref);
|
|
|
|
}
|
2011-06-23 07:27:13 +00:00
|
|
|
cache_acl:
|
2009-04-27 15:47:50 +00:00
|
|
|
/*
|
|
|
|
* try to precache a NULL acl entry for files that don't have
|
|
|
|
* any xattrs or acls
|
|
|
|
*/
|
2011-04-20 02:31:50 +00:00
|
|
|
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
if (first_xattr_slot != -1) {
|
|
|
|
path->slots[0] = first_xattr_slot;
|
|
|
|
ret = btrfs_load_inode_props(inode, path);
|
|
|
|
if (ret)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_err(fs_info,
|
2014-05-15 14:48:20 +00:00
|
|
|
"error loading props for ino %llu (root %llu): %d",
|
2017-01-10 18:35:31 +00:00
|
|
|
btrfs_ino(BTRFS_I(inode)),
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
root->root_key.objectid, ret);
|
|
|
|
}
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2009-06-24 20:58:48 +00:00
|
|
|
if (!maybe_acls)
|
|
|
|
cache_no_acl(inode);
|
2009-04-27 15:47:50 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
switch (inode->i_mode & S_IFMT) {
|
|
|
|
case S_IFREG:
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
2008-01-24 21:13:08 +00:00
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2007-06-12 10:35:45 +00:00
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
|
|
|
break;
|
|
|
|
case S_IFDIR:
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
2017-01-26 01:06:38 +00:00
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
2007-06-12 10:35:45 +00:00
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
2015-11-17 06:07:57 +00:00
|
|
|
inode_nohighmem(inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
inode->i_mapping->a_ops = &btrfs_symlink_aops;
|
|
|
|
break;
|
2007-07-11 14:18:17 +00:00
|
|
|
default:
|
2009-02-04 14:29:13 +00:00
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
2007-07-11 14:18:17 +00:00
|
|
|
init_special_inode(inode, inode->i_mode, rdev);
|
|
|
|
break;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2009-04-17 08:37:41 +00:00
|
|
|
|
2018-03-26 16:40:21 +00:00
|
|
|
btrfs_sync_inode_flags_to_i_flags(inode);
|
2016-06-06 10:51:25 +00:00
|
|
|
return 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
make_bad:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
make_bad_inode(inode);
|
2016-06-06 10:51:25 +00:00
|
|
|
return ret;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* given a leaf and an inode, copy the inode fields into the leaf
|
|
|
|
*/
|
2008-09-05 20:13:11 +00:00
|
|
|
static void fill_inode_item(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *leaf,
|
2007-10-15 20:14:19 +00:00
|
|
|
struct btrfs_inode_item *item,
|
2007-06-12 10:35:45 +00:00
|
|
|
struct inode *inode)
|
|
|
|
{
|
2012-12-27 09:01:21 +00:00
|
|
|
struct btrfs_map_token token;
|
|
|
|
|
|
|
|
btrfs_init_map_token(&token);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2012-12-27 09:01:21 +00:00
|
|
|
btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
|
|
|
|
btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
|
|
|
|
btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
|
|
|
|
&token);
|
|
|
|
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
|
|
|
|
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2014-12-12 16:39:12 +00:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->atime,
|
2012-12-27 09:01:21 +00:00
|
|
|
inode->i_atime.tv_sec, &token);
|
2014-12-12 16:39:12 +00:00
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->atime,
|
2012-12-27 09:01:21 +00:00
|
|
|
inode->i_atime.tv_nsec, &token);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2014-12-12 16:39:12 +00:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->mtime,
|
2012-12-27 09:01:21 +00:00
|
|
|
inode->i_mtime.tv_sec, &token);
|
2014-12-12 16:39:12 +00:00
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->mtime,
|
2012-12-27 09:01:21 +00:00
|
|
|
inode->i_mtime.tv_nsec, &token);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2014-12-12 16:39:12 +00:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->ctime,
|
2012-12-27 09:01:21 +00:00
|
|
|
inode->i_ctime.tv_sec, &token);
|
2014-12-12 16:39:12 +00:00
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->ctime,
|
2012-12-27 09:01:21 +00:00
|
|
|
inode->i_ctime.tv_nsec, &token);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2012-07-04 07:18:07 +00:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->otime,
|
|
|
|
BTRFS_I(inode)->i_otime.tv_sec, &token);
|
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->otime,
|
|
|
|
BTRFS_I(inode)->i_otime.tv_nsec, &token);
|
|
|
|
|
2012-12-27 09:01:21 +00:00
|
|
|
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
|
|
|
|
&token);
|
|
|
|
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
|
|
|
|
&token);
|
2017-12-11 11:35:12 +00:00
|
|
|
btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
|
|
|
|
&token);
|
2012-12-27 09:01:21 +00:00
|
|
|
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
|
|
|
|
btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
|
|
|
|
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
|
|
|
|
btrfs_set_token_inode_block_group(leaf, item, 0, &token);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
*/
|
2011-11-11 01:39:08 +00:00
|
|
|
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
|
2009-01-06 02:25:51 +00:00
|
|
|
struct btrfs_root *root, struct inode *inode)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
struct btrfs_path *path;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2009-03-13 15:00:37 +00:00
|
|
|
path->leave_spinning = 1;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
|
|
|
|
1);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (ret) {
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto failed;
|
|
|
|
}
|
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
struct btrfs_inode_item);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2008-09-05 20:13:11 +00:00
|
|
|
fill_inode_item(trans, leaf, inode_item, inode);
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-08-10 20:22:09 +00:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
ret = 0;
|
|
|
|
failed:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-11-11 01:39:08 +00:00
|
|
|
/*
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
*/
|
|
|
|
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root, struct inode *inode)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2011-11-11 01:39:08 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the inode is a free space inode, we can deadlock during commit
|
|
|
|
* if we put it into the delayed code.
|
|
|
|
*
|
|
|
|
* The data relocation inode should also be directly updated
|
|
|
|
* without delay
|
|
|
|
*/
|
2017-02-20 11:50:35 +00:00
|
|
|
if (!btrfs_is_free_space_inode(BTRFS_I(inode))
|
2014-09-18 15:30:44 +00:00
|
|
|
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
|
2016-06-22 22:54:23 +00:00
|
|
|
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
|
2012-07-25 15:35:53 +00:00
|
|
|
btrfs_update_root_times(trans, root);
|
|
|
|
|
2011-11-11 01:39:08 +00:00
|
|
|
ret = btrfs_delayed_update_inode(trans, root, inode);
|
|
|
|
if (!ret)
|
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return btrfs_update_inode_item(trans, root, inode);
|
|
|
|
}
|
|
|
|
|
2012-10-22 19:43:12 +00:00
|
|
|
noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *inode)
|
2011-11-11 01:39:08 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret == -ENOSPC)
|
|
|
|
return btrfs_update_inode_item(trans, root, inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* unlink helper that gets used here in inode.c and in the tree logging
|
|
|
|
* recovery code. It remove a link in a directory with a given name, and
|
|
|
|
* also drops the back refs in the inode to the directory
|
|
|
|
*/
|
2011-03-04 17:14:37 +00:00
|
|
|
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
2017-01-17 22:31:44 +00:00
|
|
|
struct btrfs_inode *dir,
|
|
|
|
struct btrfs_inode *inode,
|
2011-03-04 17:14:37 +00:00
|
|
|
const char *name, int name_len)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret = 0;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_dir_item *di;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct btrfs_key key;
|
2008-07-24 16:12:38 +00:00
|
|
|
u64 index;
|
2011-04-20 02:31:50 +00:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 dir_ino = btrfs_ino(dir);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2007-06-22 18:16:25 +00:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
2011-02-03 03:16:25 +00:00
|
|
|
goto out;
|
2007-06-22 18:16:25 +00:00
|
|
|
}
|
|
|
|
|
2009-03-13 15:00:37 +00:00
|
|
|
path->leave_spinning = 1;
|
2011-04-20 02:31:50 +00:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2007-06-12 10:35:45 +00:00
|
|
|
name, name_len, -1);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
if (!di) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto err;
|
|
|
|
}
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &key);
|
2007-06-12 10:35:45 +00:00
|
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
2007-06-22 18:16:25 +00:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2013-12-26 05:07:06 +00:00
|
|
|
/*
|
|
|
|
* If we don't have dir index, we have to get it by looking up
|
|
|
|
* the inode ref, since we get the inode ref, remove it directly,
|
|
|
|
* it is unnecessary to do delayed deletion.
|
|
|
|
*
|
|
|
|
* But if we have dir index, needn't search inode ref to get it.
|
|
|
|
* Since the inode ref is close to the inode item, it is better
|
|
|
|
* that we delay to delete it, and just do this deletion when
|
|
|
|
* we update the inode item.
|
|
|
|
*/
|
2017-01-17 22:31:44 +00:00
|
|
|
if (inode->dir_index) {
|
2013-12-26 05:07:06 +00:00
|
|
|
ret = btrfs_delayed_delete_inode_ref(inode);
|
|
|
|
if (!ret) {
|
2017-01-17 22:31:44 +00:00
|
|
|
index = inode->dir_index;
|
2013-12-26 05:07:06 +00:00
|
|
|
goto skip_backref;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
dir_ino, &index);
|
2008-07-24 16:12:38 +00:00
|
|
|
if (ret) {
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_info(fs_info,
|
2013-03-19 22:41:23 +00:00
|
|
|
"failed to delete reference to %.*s, inode %llu parent %llu",
|
2013-08-20 11:20:07 +00:00
|
|
|
name_len, name, ino, dir_ino);
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2008-07-24 16:12:38 +00:00
|
|
|
goto err;
|
|
|
|
}
|
2013-12-26 05:07:06 +00:00
|
|
|
skip_backref:
|
2018-08-01 03:32:26 +00:00
|
|
|
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2007-06-12 10:35:45 +00:00
|
|
|
goto err;
|
2012-03-12 15:03:00 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
|
|
|
|
dir_ino);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret != 0 && ret != -ENOENT) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
goto err;
|
|
|
|
}
|
2008-09-05 20:13:11 +00:00
|
|
|
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
|
|
|
|
index);
|
2010-10-30 11:34:24 +00:00
|
|
|
if (ret == -ENOENT)
|
|
|
|
ret = 0;
|
2013-04-02 21:02:16 +00:00
|
|
|
else if (ret)
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2007-06-12 10:35:45 +00:00
|
|
|
err:
|
|
|
|
btrfs_free_path(path);
|
2008-09-05 20:13:11 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2017-02-20 11:50:34 +00:00
|
|
|
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
|
2017-01-17 22:31:44 +00:00
|
|
|
inode_inc_iversion(&inode->vfs_inode);
|
|
|
|
inode_inc_iversion(&dir->vfs_inode);
|
|
|
|
inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
|
|
|
|
dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
|
|
|
|
ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
|
2008-09-05 20:13:11 +00:00
|
|
|
out:
|
2007-06-12 10:35:45 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-03-04 17:14:37 +00:00
|
|
|
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
2017-01-17 22:31:44 +00:00
|
|
|
struct btrfs_inode *dir, struct btrfs_inode *inode,
|
2011-03-04 17:14:37 +00:00
|
|
|
const char *name, int name_len)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
|
|
|
|
if (!ret) {
|
2017-01-17 22:31:44 +00:00
|
|
|
drop_nlink(&inode->vfs_inode);
|
|
|
|
ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
|
2011-03-04 17:14:37 +00:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2010-05-16 14:48:46 +00:00
|
|
|
/*
|
|
|
|
* helper to start transaction for unlink and rmdir.
|
|
|
|
*
|
2013-05-29 18:54:47 +00:00
|
|
|
* unlink and rmdir are special in btrfs, they do not always free space, so
|
|
|
|
* if we cannot make our reservations the normal way try and see if there is
|
|
|
|
* plenty of slack room in the global reserve to migrate, otherwise we cannot
|
|
|
|
* allow the unlink to occur.
|
2010-05-16 14:48:46 +00:00
|
|
|
*/
|
2013-05-29 18:54:47 +00:00
|
|
|
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
|
2009-09-21 19:56:00 +00:00
|
|
|
{
|
2010-05-16 14:48:46 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2009-09-21 19:56:00 +00:00
|
|
|
|
2011-10-11 18:18:24 +00:00
|
|
|
/*
|
|
|
|
* 1 for the possible orphan item
|
|
|
|
* 1 for the dir item
|
|
|
|
* 1 for the dir index
|
|
|
|
* 1 for the inode ref
|
|
|
|
* 1 for the inode
|
|
|
|
*/
|
2015-11-13 23:57:16 +00:00
|
|
|
return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
|
2010-05-16 14:48:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2010-05-16 14:48:46 +00:00
|
|
|
int ret;
|
|
|
|
|
2013-05-29 18:54:47 +00:00
|
|
|
trans = __unlink_start_trans(dir);
|
2010-05-16 14:48:46 +00:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2017-01-17 22:31:44 +00:00
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
|
|
|
|
0);
|
2009-03-24 14:24:20 +00:00
|
|
|
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
|
|
|
|
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
|
|
|
|
dentry->d_name.len);
|
2011-07-19 07:27:20 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2010-05-16 14:48:46 +00:00
|
|
|
if (inode->i_nlink == 0) {
|
2017-02-20 11:50:59 +00:00
|
|
|
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
|
2011-07-19 07:27:20 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2010-05-16 14:48:46 +00:00
|
|
|
}
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2011-07-19 07:27:20 +00:00
|
|
|
out:
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(root->fs_info);
|
2007-06-12 10:35:45 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-04-18 02:34:52 +00:00
|
|
|
/*
 * Remove the entry @name (of length @name_len) that links subvolume
 * @objectid into directory @dir.
 *
 * The dir item is deleted, the root ref is dropped, the dir index is queued
 * for delayed deletion and the size, iversion and timestamps of @dir are
 * updated.  Any failure after the initial dir item lookup aborts @trans.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
 * the entry does not exist, or another negative errno.
 */
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			       struct inode *dir, u64 objectid,
			       const char *name, int name_len)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
	struct btrfs_path *path;
	struct btrfs_dir_item *item;
	struct btrfs_key location;
	u64 index;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	item = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				     name, name_len, -1);
	if (IS_ERR_OR_NULL(item)) {
		ret = item ? PTR_ERR(item) : -ENOENT;
		goto out;
	}

	/* The dir item is expected to point at the root item of @objectid. */
	btrfs_dir_item_key_to_cpu(path->nodes[0], item, &location);
	WARN_ON(location.type != BTRFS_ROOT_ITEM_KEY ||
		location.objectid != objectid);

	ret = btrfs_delete_one_dir_name(trans, root, path, item);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	btrfs_release_path(path);

	ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
				 dir_ino, &index, name, name_len);
	if (ret < 0) {
		if (ret != -ENOENT) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		/*
		 * No root ref to take the index from: look the dir index
		 * item up by name and use its key offset instead.
		 */
		item = btrfs_search_dir_index_item(root, path, dir_ino,
						   name, name_len);
		if (IS_ERR_OR_NULL(item)) {
			ret = item ? PTR_ERR(item) : -ENOENT;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &location,
				      path->slots[0]);
		index = location.offset;
	}
	btrfs_release_path(path);

	ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/* Each entry accounts for twice its name length in the dir size. */
	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
	inode_inc_iversion(dir);
	dir->i_mtime = dir->i_ctime = current_time(dir);

	ret = btrfs_update_inode_fallback(trans, root, dir);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}
|
|
|
|
|
2018-04-18 02:34:13 +00:00
|
|
|
/*
|
|
|
|
* Helper to check if the subvolume references other subvolumes or if it's
|
|
|
|
* default.
|
|
|
|
*/
|
2018-04-18 02:34:52 +00:00
|
|
|
static noinline int may_destroy_subvol(struct btrfs_root *root)
|
2018-04-18 02:34:13 +00:00
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 dir_id;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Make sure this root isn't set as the default subvol */
|
|
|
|
dir_id = btrfs_super_root_dir(fs_info->super_copy);
|
|
|
|
di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
|
|
|
|
dir_id, "default", 7, 0);
|
|
|
|
if (di && !IS_ERR(di)) {
|
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
|
|
|
|
if (key.objectid == root->root_key.objectid) {
|
|
|
|
ret = -EPERM;
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"deleting default subvolume %llu is not allowed",
|
|
|
|
key.objectid);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
btrfs_release_path(path);
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = root->root_key.objectid;
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
BUG_ON(ret == 0);
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
if (path->slots[0] > 0) {
|
|
|
|
path->slots[0]--;
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
|
|
|
|
if (key.objectid == root->root_key.objectid &&
|
|
|
|
key.type == BTRFS_ROOT_REF_KEY)
|
|
|
|
ret = -ENOTEMPTY;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-04-27 11:36:24 +00:00
|
|
|
/* Delete all dentries for inodes belonging to the root */
|
|
|
|
static void btrfs_prune_dentries(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct rb_node *node;
|
|
|
|
struct rb_node *prev;
|
|
|
|
struct btrfs_inode *entry;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 objectid = 0;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
|
|
|
|
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
|
|
|
|
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
again:
|
|
|
|
node = root->inode_tree.rb_node;
|
|
|
|
prev = NULL;
|
|
|
|
while (node) {
|
|
|
|
prev = node;
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
|
|
|
|
2018-06-29 08:56:40 +00:00
|
|
|
if (objectid < btrfs_ino(entry))
|
2018-04-27 11:36:24 +00:00
|
|
|
node = node->rb_left;
|
2018-06-29 08:56:40 +00:00
|
|
|
else if (objectid > btrfs_ino(entry))
|
2018-04-27 11:36:24 +00:00
|
|
|
node = node->rb_right;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!node) {
|
|
|
|
while (prev) {
|
|
|
|
entry = rb_entry(prev, struct btrfs_inode, rb_node);
|
2018-06-29 08:56:40 +00:00
|
|
|
if (objectid <= btrfs_ino(entry)) {
|
2018-04-27 11:36:24 +00:00
|
|
|
node = prev;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
prev = rb_next(prev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (node) {
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
2018-06-29 08:56:40 +00:00
|
|
|
objectid = btrfs_ino(entry) + 1;
|
2018-04-27 11:36:24 +00:00
|
|
|
inode = igrab(&entry->vfs_inode);
|
|
|
|
if (inode) {
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
if (atomic_read(&inode->i_count) > 1)
|
|
|
|
d_prune_aliases(inode);
|
|
|
|
/*
|
|
|
|
* btrfs_drop_inode will have it removed from the inode
|
|
|
|
* cache when its usage count hits zero.
|
|
|
|
*/
|
|
|
|
iput(inode);
|
|
|
|
cond_resched();
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cond_resched_lock(&root->inode_lock))
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
node = rb_next(node);
|
|
|
|
}
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
}
|
|
|
|
|
2018-04-18 02:34:52 +00:00
|
|
|
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct inode *inode = d_inode(dentry);
|
|
|
|
struct btrfs_root *dest = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_block_rsv block_rsv;
|
|
|
|
u64 root_flags;
|
|
|
|
int ret;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't allow to delete a subvolume with send in progress. This is
|
|
|
|
* inside the inode lock so the error handling that has to drop the bit
|
|
|
|
* again is not run concurrently.
|
|
|
|
*/
|
|
|
|
spin_lock(&dest->root_item_lock);
|
|
|
|
root_flags = btrfs_root_flags(&dest->root_item);
|
|
|
|
if (dest->send_in_progress == 0) {
|
|
|
|
btrfs_set_root_flags(&dest->root_item,
|
|
|
|
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
|
|
|
|
spin_unlock(&dest->root_item_lock);
|
|
|
|
} else {
|
|
|
|
spin_unlock(&dest->root_item_lock);
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"attempt to delete subvolume %llu during send",
|
|
|
|
dest->root_key.objectid);
|
|
|
|
return -EPERM;
|
|
|
|
}
|
|
|
|
|
|
|
|
down_write(&fs_info->subvol_sem);
|
|
|
|
|
|
|
|
err = may_destroy_subvol(dest);
|
|
|
|
if (err)
|
|
|
|
goto out_up_write;
|
|
|
|
|
|
|
|
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
|
|
|
|
/*
|
|
|
|
* One for dir inode,
|
|
|
|
* two for dir entries,
|
|
|
|
* two for root ref/backref.
|
|
|
|
*/
|
2018-05-30 03:00:38 +00:00
|
|
|
err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
|
2018-04-18 02:34:52 +00:00
|
|
|
if (err)
|
|
|
|
goto out_up_write;
|
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out_release;
|
|
|
|
}
|
|
|
|
trans->block_rsv = &block_rsv;
|
|
|
|
trans->bytes_reserved = block_rsv.size;
|
|
|
|
|
|
|
|
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
|
|
|
|
|
2018-08-01 03:32:30 +00:00
|
|
|
ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid,
|
|
|
|
dentry->d_name.name, dentry->d_name.len);
|
2018-04-18 02:34:52 +00:00
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto out_end_trans;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_record_root_in_trans(trans, dest);
|
|
|
|
|
|
|
|
memset(&dest->root_item.drop_progress, 0,
|
|
|
|
sizeof(dest->root_item.drop_progress));
|
|
|
|
dest->root_item.drop_level = 0;
|
|
|
|
btrfs_set_root_refs(&dest->root_item, 0);
|
|
|
|
|
|
|
|
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
|
|
|
|
ret = btrfs_insert_orphan_item(trans,
|
|
|
|
fs_info->tree_root,
|
|
|
|
dest->root_key.objectid);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
err = ret;
|
|
|
|
goto out_end_trans;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-29 07:01:54 +00:00
|
|
|
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
|
2018-04-18 02:34:52 +00:00
|
|
|
BTRFS_UUID_KEY_SUBVOL,
|
|
|
|
dest->root_key.objectid);
|
|
|
|
if (ret && ret != -ENOENT) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
err = ret;
|
|
|
|
goto out_end_trans;
|
|
|
|
}
|
|
|
|
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
|
2018-05-29 07:01:54 +00:00
|
|
|
ret = btrfs_uuid_tree_remove(trans,
|
2018-04-18 02:34:52 +00:00
|
|
|
dest->root_item.received_uuid,
|
|
|
|
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
|
|
|
|
dest->root_key.objectid);
|
|
|
|
if (ret && ret != -ENOENT) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
err = ret;
|
|
|
|
goto out_end_trans;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out_end_trans:
|
|
|
|
trans->block_rsv = NULL;
|
|
|
|
trans->bytes_reserved = 0;
|
|
|
|
ret = btrfs_end_transaction(trans);
|
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
|
|
|
inode->i_flags |= S_DEAD;
|
|
|
|
out_release:
|
|
|
|
btrfs_subvolume_release_metadata(fs_info, &block_rsv);
|
|
|
|
out_up_write:
|
|
|
|
up_write(&fs_info->subvol_sem);
|
|
|
|
if (err) {
|
|
|
|
spin_lock(&dest->root_item_lock);
|
|
|
|
root_flags = btrfs_root_flags(&dest->root_item);
|
|
|
|
btrfs_set_root_flags(&dest->root_item,
|
|
|
|
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
|
|
|
|
spin_unlock(&dest->root_item_lock);
|
|
|
|
} else {
|
|
|
|
d_invalidate(dentry);
|
2018-04-27 11:36:24 +00:00
|
|
|
btrfs_prune_dentries(dest);
|
2018-04-18 02:34:52 +00:00
|
|
|
ASSERT(dest->send_in_progress == 0);
|
|
|
|
|
|
|
|
/* the last ref */
|
|
|
|
if (dest->ino_cache_inode) {
|
|
|
|
iput(dest->ino_cache_inode);
|
|
|
|
dest->ino_cache_inode = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2007-12-21 21:27:21 +00:00
|
|
|
int err = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2016-06-06 15:11:13 +00:00
|
|
|
u64 last_unlink_trans;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2012-09-13 22:04:34 +00:00
|
|
|
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
2007-10-25 19:49:25 +00:00
|
|
|
return -ENOTEMPTY;
|
2017-01-10 18:35:31 +00:00
|
|
|
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
|
2018-04-18 02:35:31 +00:00
|
|
|
return btrfs_delete_subvolume(dir, dentry);
|
2007-10-25 19:49:25 +00:00
|
|
|
|
2013-05-29 18:54:47 +00:00
|
|
|
trans = __unlink_start_trans(dir);
|
2010-05-16 14:48:46 +00:00
|
|
|
if (IS_ERR(trans))
|
2009-11-11 02:23:48 +00:00
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
2017-01-10 18:35:31 +00:00
|
|
|
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
2018-08-01 03:32:30 +00:00
|
|
|
err = btrfs_unlink_subvol(trans, dir,
|
2009-09-21 19:56:00 +00:00
|
|
|
BTRFS_I(inode)->location.objectid,
|
|
|
|
dentry->d_name.name,
|
|
|
|
dentry->d_name.len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:50:59 +00:00
|
|
|
err = btrfs_orphan_add(trans, BTRFS_I(inode));
|
2008-07-24 16:17:14 +00:00
|
|
|
if (err)
|
2009-09-21 19:56:00 +00:00
|
|
|
goto out;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2016-06-06 15:11:13 +00:00
|
|
|
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
/* now the directory is empty */
|
2017-01-17 22:31:44 +00:00
|
|
|
err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
|
|
|
|
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
|
|
|
|
dentry->d_name.len);
|
2016-06-06 15:11:13 +00:00
|
|
|
if (!err) {
|
2017-02-20 11:50:34 +00:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), 0);
|
2016-06-06 15:11:13 +00:00
|
|
|
/*
|
|
|
|
* Propagate the last_unlink_trans value of the deleted dir to
|
|
|
|
* its parent directory. This is to prevent an unrecoverable
|
|
|
|
* log tree in the case we do something like this:
|
|
|
|
* 1) create dir foo
|
|
|
|
* 2) create snapshot under dir foo
|
|
|
|
* 3) delete the snapshot
|
|
|
|
* 4) rmdir foo
|
|
|
|
* 5) mkdir foo
|
|
|
|
* 6) fsync foo or some file inside foo
|
|
|
|
*/
|
|
|
|
if (last_unlink_trans >= trans->transid)
|
|
|
|
BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
|
|
|
|
}
|
2009-09-21 19:56:00 +00:00
|
|
|
out:
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(root->fs_info);
|
2007-12-12 19:38:19 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-02-04 14:59:29 +00:00
|
|
|
static int truncate_space_check(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
u64 bytes_deleted)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2015-02-04 14:59:29 +00:00
|
|
|
int ret;
|
|
|
|
|
2016-01-13 16:48:06 +00:00
|
|
|
/*
|
|
|
|
* This is only used to apply pressure to the enospc system, we don't
|
|
|
|
* intend to use this reservation at all.
|
|
|
|
*/
|
2016-06-22 22:54:24 +00:00
|
|
|
bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
|
2016-06-22 22:54:23 +00:00
|
|
|
bytes_deleted *= fs_info->nodesize;
|
|
|
|
ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
|
2015-02-04 14:59:29 +00:00
|
|
|
bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
|
2016-01-13 16:48:06 +00:00
|
|
|
if (!ret) {
|
2016-06-22 22:54:23 +00:00
|
|
|
trace_btrfs_space_reservation(fs_info, "transaction",
|
2016-01-13 16:48:06 +00:00
|
|
|
trans->transid,
|
|
|
|
bytes_deleted, 1);
|
2015-02-04 14:59:29 +00:00
|
|
|
trans->bytes_reserved += bytes_deleted;
|
2016-01-13 16:48:06 +00:00
|
|
|
}
|
2015-02-04 14:59:29 +00:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2017-10-19 18:16:02 +00:00
|
|
|
/*
 * Return value telling the caller that truncate_block still has to be called
 * for the last bit of the truncate.
 */
#define NEED_TRUNCATE_BLOCK 1
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or made unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readable. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (especially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shrunk by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework every time a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 11:34:25 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
/*
|
|
|
|
* this can truncate away extent items, csum items and directory items.
|
|
|
|
* It starts at a high offset and removes keys until it can't find
|
2008-09-29 19:18:18 +00:00
|
|
|
* any higher than new_size
|
2007-06-12 10:35:45 +00:00
|
|
|
*
|
|
|
|
* csum items that cross the new i_size are truncated to the new size
|
|
|
|
* as well.
|
2008-07-24 16:17:14 +00:00
|
|
|
*
|
|
|
|
* min_type is the minimum key type to truncate down to. If set to 0, this
|
|
|
|
* will kill all the items on this inode, including the INODE_ITEM_KEY.
|
2007-06-12 10:35:45 +00:00
|
|
|
*/
|
2009-11-12 09:35:36 +00:00
|
|
|
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *inode,
|
|
|
|
u64 new_size, u32 min_type)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_path *path;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_file_extent_item *fi;
|
2009-11-12 09:35:36 +00:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
2007-06-12 10:35:45 +00:00
|
|
|
u64 extent_start = 0;
|
2007-10-15 20:15:53 +00:00
|
|
|
u64 extent_num_bytes = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
u64 extent_offset = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
u64 item_end = 0;
|
Btrfs: fix shrinking truncate when the no_holes feature is enabled
If the no_holes feature is enabled, we attempt to shrink a file to a size
that ends up in the middle of a hole and we don't have any file extent
items in the fs/subvol tree that go beyond the new file size (or any
ordered extents that will insert such file extent items), we end up not
updating the inode's disk_i_size, we only update the inode's i_size.
This means that after unmounting and mounting the filesystem, or after
the inode is evicted and reloaded, its i_size ends up being incorrect
(an inode's i_size is set to the disk_i_size field when an inode is
loaded). This happens when btrfs_truncate_inode_items() doesn't find
any file extent items to drop - in this case it never makes a call to
btrfs_ordered_update_i_size() in order to update the inode's disk_i_size.
Example reproducer:
$ mkfs.btrfs -O no-holes -f /dev/sdd
$ mount /dev/sdd /mnt
# Create our test file with some data and durably persist it.
$ xfs_io -f -c "pwrite -S 0xaa 0 128K" /mnt/foo
$ sync
# Append some data to the file, increasing its size, and leave a hole
# between the old size and the start offset of the following write. So
# our file gets a hole in the range [128Kb, 256Kb[.
$ xfs_io -c "truncate 160K" /mnt/foo
# We expect to see our file with a size of 160Kb, with the first 128Kb
# of data all having the value 0xaa and the remaining 32Kb of data all
# having the value 0x00.
$ od -t x1 /mnt/foo
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0400000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0500000
# Now cleanly unmount and mount again the filesystem.
$ umount /mnt
$ mount /dev/sdd /mnt
# We expect to get the same result as before, a file with a size of
# 160Kb, with the first 128Kb of data all having the value 0xaa and the
# remaining 32Kb of data all having the value 0x00.
$ od -t x1 /mnt/foo
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0400000
In the example above the file size/data do not match what they were before
the remount.
Fix this by always calling btrfs_ordered_update_i_size() with a size
matching the size the file was truncated to if btrfs_truncate_inode_items()
is not called for a log tree and no file extent items were dropped. This
ensures the same behaviour as when the no_holes feature is not enabled.
A test case for fstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-20 17:20:09 +00:00
|
|
|
u64 last_size = new_size;
|
2009-11-12 09:35:36 +00:00
|
|
|
u32 found_type = (u8)-1;
|
2007-06-12 10:35:45 +00:00
|
|
|
int found_extent;
|
|
|
|
int del_item;
|
2008-01-29 20:11:36 +00:00
|
|
|
int pending_del_nr = 0;
|
|
|
|
int pending_del_slot = 0;
|
2007-11-01 15:28:41 +00:00
|
|
|
int extent_type = -1;
|
2009-11-12 09:35:36 +00:00
|
|
|
int ret;
|
2017-01-10 18:35:31 +00:00
|
|
|
u64 ino = btrfs_ino(BTRFS_I(inode));
|
2014-12-17 17:41:04 +00:00
|
|
|
u64 bytes_deleted = 0;
|
2017-10-07 14:02:21 +00:00
|
|
|
bool be_nice = false;
|
|
|
|
bool should_throttle = false;
|
|
|
|
bool should_end = false;
|
2009-11-12 09:35:36 +00:00
|
|
|
|
|
|
|
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2014-12-17 17:41:04 +00:00
|
|
|
/*
|
|
|
|
* for non-free space inodes and ref cows, we want to back off from
|
|
|
|
* time to time
|
|
|
|
*/
|
2017-02-20 11:50:35 +00:00
|
|
|
if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
|
2014-12-17 17:41:04 +00:00
|
|
|
test_bit(BTRFS_ROOT_REF_COWS, &root->state))
|
2017-10-07 14:02:21 +00:00
|
|
|
be_nice = true;
|
2014-12-17 17:41:04 +00:00
|
|
|
|
2011-07-12 23:44:10 +00:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2015-11-27 15:31:35 +00:00
|
|
|
path->reada = READA_BACK;
|
2011-07-12 23:44:10 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
/*
|
|
|
|
* We want to drop from the next block forward in case this new size is
|
|
|
|
* not block aligned since we will be keeping the last block of the
|
|
|
|
* extent just the way it is.
|
|
|
|
*/
|
2014-04-02 11:51:05 +00:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
|
2016-06-22 22:54:23 +00:00
|
|
|
root == fs_info->tree_root)
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize),
|
2016-06-15 13:22:56 +00:00
|
|
|
(u64)-1, 0);
|
2009-11-12 09:35:36 +00:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
/*
|
|
|
|
* This function is also used to drop the items in the log tree before
|
|
|
|
* we relog the inode, so if root != BTRFS_I(inode)->root, it means
|
|
|
|
* it is used to drop the loged items. So we shouldn't kill the delayed
|
|
|
|
* items.
|
|
|
|
*/
|
|
|
|
if (min_type == 0 && root == BTRFS_I(inode)->root)
|
2017-01-10 18:35:38 +00:00
|
|
|
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
key.objectid = ino;
|
2007-06-12 10:35:45 +00:00
|
|
|
key.offset = (u64)-1;
|
2007-10-15 20:14:19 +00:00
|
|
|
key.type = (u8)-1;
|
|
|
|
|
2008-01-29 20:11:36 +00:00
|
|
|
search_again:
|
2014-12-17 17:41:04 +00:00
|
|
|
/*
|
|
|
|
* with a 16K leaf size and 128MB extents, you can actually queue
|
|
|
|
* up a huge file in a single leaf. Most of the time that
|
|
|
|
* bytes_deleted is > 0, it will be huge by the time we get here
|
|
|
|
*/
|
2018-05-11 20:13:30 +00:00
|
|
|
if (be_nice && bytes_deleted > SZ_32M &&
|
|
|
|
btrfs_should_end_transaction(trans)) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto out;
|
2014-12-17 17:41:04 +00:00
|
|
|
}
|
|
|
|
|
2009-03-13 15:00:37 +00:00
|
|
|
path->leave_spinning = 1;
|
2008-01-29 20:11:36 +00:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
2018-05-11 20:13:30 +00:00
|
|
|
if (ret < 0)
|
2009-11-12 09:35:36 +00:00
|
|
|
goto out;
|
2009-01-06 02:25:51 +00:00
|
|
|
|
2008-01-29 20:11:36 +00:00
|
|
|
if (ret > 0) {
|
2018-05-11 20:13:30 +00:00
|
|
|
ret = 0;
|
2008-09-05 20:13:11 +00:00
|
|
|
/* there are no items in the tree for us to truncate, we're
|
|
|
|
* done
|
|
|
|
*/
|
2009-11-12 09:35:36 +00:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
goto out;
|
2008-01-29 20:11:36 +00:00
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
2009-01-06 02:25:51 +00:00
|
|
|
while (1) {
|
2007-06-12 10:35:45 +00:00
|
|
|
fi = NULL;
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
2014-06-04 16:41:45 +00:00
|
|
|
found_type = found_key.type;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
if (found_key.objectid != ino)
|
2007-06-12 10:35:45 +00:00
|
|
|
break;
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2008-01-29 20:11:36 +00:00
|
|
|
if (found_type < min_type)
|
2007-06-12 10:35:45 +00:00
|
|
|
break;
|
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
item_end = found_key.offset;
|
2007-06-12 10:35:45 +00:00
|
|
|
if (found_type == BTRFS_EXTENT_DATA_KEY) {
|
2007-10-15 20:14:19 +00:00
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_file_extent_item);
|
2007-11-01 15:28:41 +00:00
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-15 20:14:19 +00:00
|
|
|
item_end +=
|
2007-10-15 20:15:53 +00:00
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2017-03-10 19:09:48 +00:00
|
|
|
|
|
|
|
trace_btrfs_truncate_show_fi_regular(
|
|
|
|
BTRFS_I(inode), leaf, fi,
|
|
|
|
found_key.offset);
|
2007-11-01 15:28:41 +00:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
2018-06-06 07:41:49 +00:00
|
|
|
item_end += btrfs_file_extent_ram_bytes(leaf,
|
|
|
|
fi);
|
2017-03-10 19:09:48 +00:00
|
|
|
|
|
|
|
trace_btrfs_truncate_show_fi_inline(
|
|
|
|
BTRFS_I(inode), leaf, fi, path->slots[0],
|
|
|
|
found_key.offset);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2007-11-07 18:31:09 +00:00
|
|
|
item_end--;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
if (found_type > min_type) {
|
|
|
|
del_item = 1;
|
|
|
|
} else {
|
2017-02-14 16:56:01 +00:00
|
|
|
if (item_end < new_size)
|
2007-08-27 20:49:44 +00:00
|
|
|
break;
|
2009-11-12 09:35:36 +00:00
|
|
|
if (found_key.offset >= new_size)
|
|
|
|
del_item = 1;
|
|
|
|
else
|
|
|
|
del_item = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
found_extent = 0;
|
|
|
|
/* FIXME, shrink the extent if the ref count is only 1 */
|
2007-11-01 15:28:41 +00:00
|
|
|
if (found_type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
goto delete;
|
|
|
|
|
|
|
|
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
|
2007-06-12 10:35:45 +00:00
|
|
|
u64 num_dec;
|
2007-10-15 20:15:53 +00:00
|
|
|
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
|
2012-01-13 00:10:12 +00:00
|
|
|
if (!del_item) {
|
2007-10-15 20:15:53 +00:00
|
|
|
u64 orig_num_bytes =
|
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2013-02-26 08:10:22 +00:00
|
|
|
extent_num_bytes = ALIGN(new_size -
|
|
|
|
found_key.offset,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize);
|
2007-10-15 20:15:53 +00:00
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi,
|
|
|
|
extent_num_bytes);
|
|
|
|
num_dec = (orig_num_bytes -
|
2008-02-08 18:49:28 +00:00
|
|
|
extent_num_bytes);
|
2014-04-02 11:51:05 +00:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS,
|
|
|
|
&root->state) &&
|
|
|
|
extent_start != 0)
|
2008-10-09 15:46:29 +00:00
|
|
|
inode_sub_bytes(inode, num_dec);
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 10:35:45 +00:00
|
|
|
} else {
|
2007-10-15 20:15:53 +00:00
|
|
|
extent_num_bytes =
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf,
|
|
|
|
fi);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
extent_offset = found_key.offset -
|
|
|
|
btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
/* FIXME blocksize != 4096 */
|
2008-02-08 18:49:28 +00:00
|
|
|
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (extent_start != 0) {
|
|
|
|
found_extent = 1;
|
2014-04-02 11:51:05 +00:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS,
|
|
|
|
&root->state))
|
2008-10-09 15:46:29 +00:00
|
|
|
inode_sub_bytes(inode, num_dec);
|
2008-09-05 20:13:11 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2008-02-08 18:49:28 +00:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
/*
|
|
|
|
* we can't truncate inline items that have had
|
|
|
|
* special encodings
|
|
|
|
*/
|
|
|
|
if (!del_item &&
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) == 0 &&
|
2017-10-19 18:16:02 +00:00
|
|
|
btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
|
|
|
|
btrfs_file_extent_compression(leaf, fi) == 0) {
|
|
|
|
u32 size = (u32)(new_size - found_key.offset);
|
|
|
|
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
|
|
|
|
size = btrfs_file_extent_calc_inline_size(size);
|
|
|
|
btrfs_truncate_item(root->fs_info, path, size, 1);
|
|
|
|
} else if (!del_item) {
|
2014-01-04 05:07:00 +00:00
|
|
|
/*
|
2017-10-19 18:16:02 +00:00
|
|
|
* We have to bail so the last_size is set to
|
|
|
|
* just before this extent.
|
2014-01-04 05:07:00 +00:00
|
|
|
*/
|
2018-05-11 20:13:30 +00:00
|
|
|
ret = NEED_TRUNCATE_BLOCK;
|
2017-10-19 18:16:02 +00:00
|
|
|
break;
|
|
|
|
}
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or make unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readable. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (specially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shrunk by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework every time a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 11:34:25 +00:00
|
|
|
|
2017-10-19 18:16:02 +00:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or make unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readable. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (especially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shrunk by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework every time a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 11:34:25 +00:00
|
|
|
inode_sub_bytes(inode, item_end + 1 - new_size);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2007-11-01 15:28:41 +00:00
|
|
|
delete:
|
2017-10-19 18:16:02 +00:00
|
|
|
if (del_item)
|
|
|
|
last_size = found_key.offset;
|
|
|
|
else
|
|
|
|
last_size = new_size;
|
2007-06-12 10:35:45 +00:00
|
|
|
if (del_item) {
|
2008-01-29 20:11:36 +00:00
|
|
|
if (!pending_del_nr) {
|
|
|
|
/* no pending yet, add ourselves */
|
|
|
|
pending_del_slot = path->slots[0];
|
|
|
|
pending_del_nr = 1;
|
|
|
|
} else if (pending_del_nr &&
|
|
|
|
path->slots[0] + 1 == pending_del_slot) {
|
|
|
|
/* hop on the pending chunk */
|
|
|
|
pending_del_nr++;
|
|
|
|
pending_del_slot = path->slots[0];
|
|
|
|
} else {
|
2009-01-06 02:25:51 +00:00
|
|
|
BUG();
|
2008-01-29 20:11:36 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2017-10-07 14:02:21 +00:00
|
|
|
should_throttle = false;
|
2015-02-04 14:59:29 +00:00
|
|
|
|
2014-04-02 11:51:05 +00:00
|
|
|
if (found_extent &&
|
|
|
|
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
|
2016-06-22 22:54:23 +00:00
|
|
|
root == fs_info->tree_root)) {
|
2009-03-13 15:00:37 +00:00
|
|
|
btrfs_set_path_blocking(path);
|
2014-12-17 17:41:04 +00:00
|
|
|
bytes_deleted += extent_num_bytes;
|
2017-09-29 19:43:49 +00:00
|
|
|
ret = btrfs_free_extent(trans, root, extent_start,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
extent_num_bytes, 0,
|
|
|
|
btrfs_header_owner(leaf),
|
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->ref_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eliminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 06:52:54 +00:00
|
|
|
ino, extent_offset);
|
2018-05-11 20:13:31 +00:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
break;
|
|
|
|
}
|
2016-06-22 22:54:24 +00:00
|
|
|
if (btrfs_should_throttle_delayed_refs(trans, fs_info))
|
|
|
|
btrfs_async_run_delayed_refs(fs_info,
|
2016-10-18 07:56:13 +00:00
|
|
|
trans->delayed_ref_updates * 2,
|
|
|
|
trans->transid, 0);
|
2015-02-04 14:59:29 +00:00
|
|
|
if (be_nice) {
|
|
|
|
if (truncate_space_check(trans, root,
|
|
|
|
extent_num_bytes)) {
|
2017-10-07 14:02:21 +00:00
|
|
|
should_end = true;
|
2015-02-04 14:59:29 +00:00
|
|
|
}
|
|
|
|
if (btrfs_should_throttle_delayed_refs(trans,
|
2016-06-22 22:54:24 +00:00
|
|
|
fs_info))
|
2017-10-07 14:02:21 +00:00
|
|
|
should_throttle = true;
|
2015-02-04 14:59:29 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2008-01-29 20:11:36 +00:00
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
if (found_type == BTRFS_INODE_ITEM_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (path->slots[0] == 0 ||
|
2015-02-03 15:50:16 +00:00
|
|
|
path->slots[0] != pending_del_slot ||
|
2015-02-04 14:59:29 +00:00
|
|
|
should_throttle || should_end) {
|
2009-11-12 09:35:36 +00:00
|
|
|
if (pending_del_nr) {
|
|
|
|
ret = btrfs_del_items(trans, root, path,
|
|
|
|
pending_del_slot,
|
|
|
|
pending_del_nr);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2018-05-11 20:13:30 +00:00
|
|
|
break;
|
2012-03-12 15:03:00 +00:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
pending_del_nr = 0;
|
|
|
|
}
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2015-02-04 14:59:29 +00:00
|
|
|
if (should_throttle) {
|
2015-02-03 15:50:16 +00:00
|
|
|
unsigned long updates = trans->delayed_ref_updates;
|
|
|
|
if (updates) {
|
|
|
|
trans->delayed_ref_updates = 0;
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_run_delayed_refs(trans,
|
|
|
|
updates * 2);
|
2018-05-11 20:13:30 +00:00
|
|
|
if (ret)
|
|
|
|
break;
|
2015-02-03 15:50:16 +00:00
|
|
|
}
|
|
|
|
}
|
2015-02-04 14:59:29 +00:00
|
|
|
/*
|
|
|
|
* if we failed to refill our space rsv, bail out
|
|
|
|
* and let the transaction restart
|
|
|
|
*/
|
|
|
|
if (should_end) {
|
2018-05-11 20:13:30 +00:00
|
|
|
ret = -EAGAIN;
|
|
|
|
break;
|
2015-02-04 14:59:29 +00:00
|
|
|
}
|
2008-01-29 20:11:36 +00:00
|
|
|
goto search_again;
|
2009-11-12 09:35:36 +00:00
|
|
|
} else {
|
|
|
|
path->slots[0]--;
|
2008-01-29 20:11:36 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
out:
|
2018-05-11 20:13:30 +00:00
|
|
|
if (ret >= 0 && pending_del_nr) {
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = btrfs_del_items(trans, root, path, pending_del_slot,
|
2008-01-29 20:11:36 +00:00
|
|
|
pending_del_nr);
|
2018-05-11 20:13:30 +00:00
|
|
|
if (err) {
|
|
|
|
btrfs_abort_transaction(trans, err);
|
|
|
|
ret = err;
|
|
|
|
}
|
2008-01-29 20:11:36 +00:00
|
|
|
}
|
2017-02-14 16:56:01 +00:00
|
|
|
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
|
|
|
|
ASSERT(last_size >= new_size);
|
2018-05-11 20:13:30 +00:00
|
|
|
if (!ret && last_size > new_size)
|
2017-02-14 16:56:01 +00:00
|
|
|
last_size = new_size;
|
2013-08-29 20:43:28 +00:00
|
|
|
btrfs_ordered_update_i_size(inode, last_size, NULL);
|
2017-02-14 16:56:01 +00:00
|
|
|
}
|
2014-12-17 17:41:04 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
btrfs_free_path(path);
|
2014-12-17 17:41:04 +00:00
|
|
|
|
2018-05-11 20:13:30 +00:00
|
|
|
if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
|
2014-12-17 17:41:04 +00:00
|
|
|
unsigned long updates = trans->delayed_ref_updates;
|
2018-05-11 20:13:30 +00:00
|
|
|
int err;
|
|
|
|
|
2014-12-17 17:41:04 +00:00
|
|
|
if (updates) {
|
|
|
|
trans->delayed_ref_updates = 0;
|
2018-05-11 20:13:30 +00:00
|
|
|
err = btrfs_run_delayed_refs(trans, updates * 2);
|
|
|
|
if (err)
|
|
|
|
ret = err;
|
2014-12-17 17:41:04 +00:00
|
|
|
}
|
|
|
|
}
|
2018-05-11 20:13:30 +00:00
|
|
|
return ret;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-01-21 10:25:56 +00:00
|
|
|
* btrfs_truncate_block - read, zero a chunk and write a block
|
2012-08-29 18:27:18 +00:00
|
|
|
* @inode - inode that we're zeroing
|
|
|
|
* @from - the offset to start zeroing
|
|
|
|
* @len - the length to zero, 0 to zero the entire range respective to the
|
|
|
|
* offset
|
|
|
|
* @front - zero up to the offset instead of from the offset on
|
|
|
|
*
|
2016-01-21 10:25:56 +00:00
|
|
|
* This will find the block for the "from" offset and cow the block and zero the
|
2012-08-29 18:27:18 +00:00
|
|
|
* part we want to zero. This is used with truncate and hole punching.
|
2007-06-12 10:35:45 +00:00
|
|
|
*/
|
2016-01-21 10:25:56 +00:00
|
|
|
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
|
2012-08-29 18:27:18 +00:00
|
|
|
int front)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2012-08-29 18:27:18 +00:00
|
|
|
struct address_space *mapping = inode->i_mapping;
|
2008-07-17 16:53:50 +00:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2017-02-27 07:10:38 +00:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2008-07-17 16:53:50 +00:00
|
|
|
char *kaddr;
|
2016-06-22 22:54:23 +00:00
|
|
|
u32 blocksize = fs_info->sectorsize;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
pgoff_t index = from >> PAGE_SHIFT;
|
2016-01-21 10:25:56 +00:00
|
|
|
unsigned offset = from & (blocksize - 1);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct page *page;
|
2011-09-21 19:05:58 +00:00
|
|
|
gfp_t mask = btrfs_alloc_write_mask(mapping);
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret = 0;
|
2016-01-21 10:25:56 +00:00
|
|
|
u64 block_start;
|
|
|
|
u64 block_end;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2018-01-18 12:47:06 +00:00
|
|
|
if (IS_ALIGNED(offset, blocksize) &&
|
|
|
|
(!len || IS_ALIGNED(len, blocksize)))
|
2007-06-12 10:35:45 +00:00
|
|
|
goto out;
|
2016-01-21 10:25:56 +00:00
|
|
|
|
2017-10-19 18:15:55 +00:00
|
|
|
block_start = round_down(from, blocksize);
|
|
|
|
block_end = block_start + blocksize - 1;
|
|
|
|
|
2017-02-27 07:10:38 +00:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
|
2017-10-19 18:15:55 +00:00
|
|
|
block_start, blocksize);
|
2009-10-13 20:46:49 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2008-05-15 13:13:45 +00:00
|
|
|
again:
|
2011-09-21 19:05:58 +00:00
|
|
|
page = find_or_create_page(mapping, index, mask);
|
2009-10-13 20:46:49 +00:00
|
|
|
if (!page) {
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to inform qgroup to do correct meta convert/release.
The good news is, even if we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
block_start, blocksize, true);
|
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
|
2012-12-05 10:56:13 +00:00
|
|
|
ret = -ENOMEM;
|
2007-06-12 10:35:45 +00:00
|
|
|
goto out;
|
2009-10-13 20:46:49 +00:00
|
|
|
}
|
2008-07-17 16:53:50 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
if (!PageUptodate(page)) {
|
2007-06-15 17:50:00 +00:00
|
|
|
ret = btrfs_readpage(NULL, page);
|
2007-06-12 10:35:45 +00:00
|
|
|
lock_page(page);
|
2008-05-15 13:13:45 +00:00
|
|
|
if (page->mapping != mapping) {
|
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(page);
|
2008-05-15 13:13:45 +00:00
|
|
|
goto again;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
ret = -EIO;
|
2008-07-24 13:41:53 +00:00
|
|
|
goto out_unlock;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
}
|
2008-05-15 13:13:45 +00:00
|
|
|
wait_on_page_writeback(page);
|
2008-07-17 16:53:50 +00:00
|
|
|
|
2016-01-21 10:25:56 +00:00
|
|
|
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
|
2008-07-17 16:53:50 +00:00
|
|
|
set_page_extent_mapped(page);
|
|
|
|
|
2016-01-21 10:25:56 +00:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, block_start);
|
2008-07-17 16:53:50 +00:00
|
|
|
if (ordered) {
|
2016-01-21 10:25:56 +00:00
|
|
|
unlock_extent_cached(io_tree, block_start, block_end,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached_state);
|
2008-07-17 16:53:50 +00:00
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(page);
|
2008-07-17 17:53:27 +00:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-07-17 16:53:50 +00:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2016-01-21 10:25:56 +00:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
|
2012-09-06 01:10:51 +00:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
|
2017-10-31 15:37:52 +00:00
|
|
|
0, 0, &cached_state);
|
2009-10-13 20:46:49 +00:00
|
|
|
|
2017-11-04 00:16:59 +00:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
|
2016-07-19 08:50:36 +00:00
|
|
|
&cached_state, 0);
|
2009-09-11 20:12:44 +00:00
|
|
|
if (ret) {
|
2016-01-21 10:25:56 +00:00
|
|
|
unlock_extent_cached(io_tree, block_start, block_end,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached_state);
|
2009-09-11 20:12:44 +00:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2016-01-21 10:25:56 +00:00
|
|
|
if (offset != blocksize) {
|
2012-08-29 18:27:18 +00:00
|
|
|
if (!len)
|
2016-01-21 10:25:56 +00:00
|
|
|
len = blocksize - offset;
|
2008-07-17 16:53:50 +00:00
|
|
|
kaddr = kmap(page);
|
2012-08-29 18:27:18 +00:00
|
|
|
if (front)
|
2016-01-21 10:25:56 +00:00
|
|
|
memset(kaddr + (block_start - page_offset(page)),
|
|
|
|
0, offset);
|
2012-08-29 18:27:18 +00:00
|
|
|
else
|
2016-01-21 10:25:56 +00:00
|
|
|
memset(kaddr + (block_start - page_offset(page)) + offset,
|
|
|
|
0, len);
|
2008-07-17 16:53:50 +00:00
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
2008-07-17 16:53:51 +00:00
|
|
|
ClearPageChecked(page);
|
2008-07-17 16:53:50 +00:00
|
|
|
set_page_dirty(page);
|
2017-12-12 20:43:52 +00:00
|
|
|
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2008-07-24 13:41:53 +00:00
|
|
|
out_unlock:
|
2009-10-13 20:46:49 +00:00
|
|
|
if (ret)
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved, block_start,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
blocksize, true);
|
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
|
2007-06-12 10:35:45 +00:00
|
|
|
unlock_page(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(page);
|
2007-06-12 10:35:45 +00:00
|
|
|
out:
|
2017-02-27 07:10:38 +00:00
|
|
|
extent_changeset_free(data_reserved);
|
2007-06-12 10:35:45 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-22 16:18:51 +00:00
|
|
|
/*
 * Record a hole covering [offset, offset + len) for @inode in @root.
 *
 * When the filesystem has the NO_HOLES incompat feature set, nothing is
 * inserted; only the inode's logging bookkeeping is refreshed and 0 is
 * returned.  Otherwise a transaction is started, any extents in the range
 * are dropped, a file extent item for the range is inserted and the inode
 * is updated.
 *
 * Returns 0 on success, or the negative error reported by
 * btrfs_start_transaction(), btrfs_drop_extents() or
 * btrfs_insert_file_extent().  The result of btrfs_update_inode() is not
 * checked.
 */
static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
			     u64 offset, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_inode *binode = BTRFS_I(inode);
	struct btrfs_trans_handle *handle;
	int rc;

	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
		/*
		 * No hole item is written in this mode, but the inode must
		 * still appear modified so that an fsync logs the hole.
		 */
		binode->last_trans = fs_info->generation;
		binode->last_sub_trans = root->log_transid;
		binode->last_log_commit = root->last_log_commit;
		return 0;
	}

	/*
	 * Reserve three items: the extent being dropped, the extent being
	 * added, and the inode update.
	 */
	handle = btrfs_start_transaction(root, 3);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rc = btrfs_drop_extents(handle, root, inode, offset, offset + len, 1);
	if (rc) {
		btrfs_abort_transaction(handle, rc);
		goto end_trans;
	}

	/*
	 * NOTE(review): the zero arguments presumably describe an extent with
	 * no on-disk backing (a hole) - confirm against
	 * btrfs_insert_file_extent().
	 */
	rc = btrfs_insert_file_extent(handle, root, btrfs_ino(binode),
				      offset, 0, 0, len, 0, len, 0, 0, 0);
	if (rc) {
		btrfs_abort_transaction(handle, rc);
		goto end_trans;
	}

	btrfs_update_inode(handle, root, inode);

end_trans:
	btrfs_end_transaction(handle);
	return rc;
}
|
|
|
|
|
2011-03-04 20:46:53 +00:00
|
|
|
/*
|
|
|
|
* This function puts in dummy file extents for the area we're creating a hole
|
|
|
|
* for. So if we are truncating this file to a larger size we need to insert
|
|
|
|
* these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
|
|
|
|
* the range between oldsize and size
|
|
|
|
*/
|
2011-01-31 20:30:16 +00:00
|
|
|
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-10-30 18:19:41 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-05-16 14:48:46 +00:00
|
|
|
struct extent_map *em = NULL;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
2016-06-22 22:54:23 +00:00
|
|
|
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
|
|
|
|
u64 block_end = ALIGN(size, fs_info->sectorsize);
|
2008-10-30 18:19:41 +00:00
|
|
|
u64 last_byte;
|
|
|
|
u64 cur_offset;
|
|
|
|
u64 hole_size;
|
2009-09-11 20:12:44 +00:00
|
|
|
int err = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2013-06-17 21:14:39 +00:00
|
|
|
/*
|
2016-01-21 10:25:56 +00:00
|
|
|
* If our size started in the middle of a block we need to zero out the
|
|
|
|
* rest of the block before we expand the i_size, otherwise we could
|
2013-06-17 21:14:39 +00:00
|
|
|
* expose stale data.
|
|
|
|
*/
|
2016-01-21 10:25:56 +00:00
|
|
|
err = btrfs_truncate_block(inode, oldsize, 0, 0);
|
2013-06-17 21:14:39 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2008-10-30 18:19:41 +00:00
|
|
|
if (size <= hole_start)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
Btrfs: improve jitter performance of the sequential buffered write
The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.
It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.
So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.
According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.
Test Environment:
CPU: 1CPU * 4Cores
Memory: 6GB
Partition: 20GB
Test Command:
# sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
> --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-26 05:15:27 +00:00
|
|
|
|
2015-12-03 13:30:40 +00:00
|
|
|
lock_extent_bits(io_tree, hole_start, block_end - 1,
|
2012-03-01 13:57:19 +00:00
|
|
|
&cached_state);
|
2017-02-20 11:50:49 +00:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
|
Btrfs: improve jitter performance of the sequential buffered write
The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.
It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.
So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.
According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.
Test Environment:
CPU: 1CPU * 4Cores
Memory: 6GB
Partition: 20GB
Test Command:
# sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
> --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-26 05:15:27 +00:00
|
|
|
block_end - hole_start);
|
2008-10-30 18:19:41 +00:00
|
|
|
if (!ordered)
|
|
|
|
break;
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached_state);
|
Btrfs: improve jitter performance of the sequential buffered write
The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.
It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.
So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.
According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.
Test Environment:
CPU: 1CPU * 4Cores
Memory: 6GB
Partition: 20GB
Test Command:
# sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
> --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-26 05:15:27 +00:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-10-30 18:19:41 +00:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2008-10-30 18:19:41 +00:00
|
|
|
cur_offset = hole_start;
|
|
|
|
while (1) {
|
2017-02-20 11:51:06 +00:00
|
|
|
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
|
2008-10-30 18:19:41 +00:00
|
|
|
block_end - cur_offset, 0);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
err = PTR_ERR(em);
|
2013-01-08 19:37:58 +00:00
|
|
|
em = NULL;
|
2012-03-12 15:03:00 +00:00
|
|
|
break;
|
|
|
|
}
|
2008-10-30 18:19:41 +00:00
|
|
|
last_byte = min(extent_map_end(em), block_end);
|
2016-06-22 22:54:23 +00:00
|
|
|
last_byte = ALIGN(last_byte, fs_info->sectorsize);
|
2009-11-12 09:35:36 +00:00
|
|
|
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
struct extent_map *hole_em;
|
2008-10-30 18:19:41 +00:00
|
|
|
hole_size = last_byte - cur_offset;
|
2009-09-11 20:12:44 +00:00
|
|
|
|
2013-10-22 16:18:51 +00:00
|
|
|
err = maybe_insert_hole(root, inode, cur_offset,
|
|
|
|
hole_size);
|
|
|
|
if (err)
|
2011-01-31 21:03:11 +00:00
|
|
|
break;
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
cur_offset + hole_size - 1, 0);
|
|
|
|
hole_em = alloc_extent_map();
|
|
|
|
if (!hole_em) {
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
hole_em->start = cur_offset;
|
|
|
|
hole_em->len = hole_size;
|
|
|
|
hole_em->orig_start = cur_offset;
|
2009-11-12 09:35:36 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
hole_em->block_start = EXTENT_MAP_HOLE;
|
|
|
|
hole_em->block_len = 0;
|
2012-12-03 15:31:19 +00:00
|
|
|
hole_em->orig_block_len = 0;
|
2013-04-04 18:31:27 +00:00
|
|
|
hole_em->ram_bytes = hole_size;
|
2016-06-22 22:54:23 +00:00
|
|
|
hole_em->bdev = fs_info->fs_devices->latest_bdev;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
hole_em->compress_type = BTRFS_COMPRESS_NONE;
|
2016-06-22 22:54:23 +00:00
|
|
|
hole_em->generation = fs_info->generation;
|
2009-11-12 09:35:36 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
while (1) {
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-05 20:51:15 +00:00
|
|
|
err = add_extent_mapping(em_tree, hole_em, 1);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
if (err != -EEXIST)
|
|
|
|
break;
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode),
|
|
|
|
cur_offset,
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
cur_offset +
|
|
|
|
hole_size - 1, 0);
|
|
|
|
}
|
|
|
|
free_extent_map(hole_em);
|
2008-10-30 18:19:41 +00:00
|
|
|
}
|
2013-10-22 16:18:51 +00:00
|
|
|
next:
|
2008-10-30 18:19:41 +00:00
|
|
|
free_extent_map(em);
|
2010-05-16 14:48:46 +00:00
|
|
|
em = NULL;
|
2008-10-30 18:19:41 +00:00
|
|
|
cur_offset = last_byte;
|
2009-11-12 09:35:36 +00:00
|
|
|
if (cur_offset >= block_end)
|
2008-10-30 18:19:41 +00:00
|
|
|
break;
|
|
|
|
}
|
2010-05-16 14:48:46 +00:00
|
|
|
free_extent_map(em);
|
2017-12-12 20:43:52 +00:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
|
2008-10-30 18:19:41 +00:00
|
|
|
return err;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2013-01-12 02:57:22 +00:00
|
|
|
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
|
2009-11-12 09:35:36 +00:00
|
|
|
{
|
2011-12-15 01:12:01 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2011-01-31 20:30:16 +00:00
|
|
|
loff_t oldsize = i_size_read(inode);
|
2013-01-12 02:57:22 +00:00
|
|
|
loff_t newsize = attr->ia_size;
|
|
|
|
int mask = attr->ia_valid;
|
2009-11-12 09:35:36 +00:00
|
|
|
int ret;
|
|
|
|
|
2013-01-12 02:57:22 +00:00
|
|
|
/*
|
|
|
|
* The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
|
|
|
|
* special case where we need to update the times despite not having
|
|
|
|
* these flags set. For all other operations the VFS set these flags
|
|
|
|
* explicitly if it wants a timestamp update.
|
|
|
|
*/
|
2013-11-19 15:17:07 +00:00
|
|
|
if (newsize != oldsize) {
|
|
|
|
inode_inc_iversion(inode);
|
|
|
|
if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
|
|
|
|
inode->i_ctime = inode->i_mtime =
|
2016-09-14 14:48:06 +00:00
|
|
|
current_time(inode);
|
2013-11-19 15:17:07 +00:00
|
|
|
}
|
2013-01-12 02:57:22 +00:00
|
|
|
|
2011-01-31 20:30:16 +00:00
|
|
|
if (newsize > oldsize) {
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
/*
|
2017-06-22 00:19:11 +00:00
|
|
|
* Don't do an expanding truncate while snapshotting is ongoing.
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
* This is to ensure the snapshot captures a fully consistent
|
|
|
|
* state of this file - if the snapshot captures this expanding
|
|
|
|
* truncation, it must capture all writes that happened before
|
|
|
|
* this truncation.
|
|
|
|
*/
|
2016-01-06 10:56:36 +00:00
|
|
|
btrfs_wait_for_snapshot_creation(root);
|
2011-01-31 20:30:16 +00:00
|
|
|
ret = btrfs_cont_expand(inode, oldsize, newsize);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
if (ret) {
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2009-11-12 09:35:36 +00:00
|
|
|
return ret;
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
|
2011-12-15 01:12:01 +00:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
if (IS_ERR(trans)) {
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2011-12-15 01:12:01 +00:00
|
|
|
return PTR_ERR(trans);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 11:57:59 +00:00
|
|
|
}
|
2011-12-15 01:12:01 +00:00
|
|
|
|
|
|
|
i_size_write(inode, newsize);
|
|
|
|
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
|
2016-01-21 10:26:03 +00:00
|
|
|
pagecache_isize_extended(inode, oldsize, newsize);
|
2011-12-15 01:12:01 +00:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2017-06-22 00:19:11 +00:00
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2011-01-31 20:30:16 +00:00
|
|
|
} else {
|
2009-11-12 09:35:36 +00:00
|
|
|
|
2011-01-31 20:30:16 +00:00
|
|
|
/*
|
|
|
|
* We're truncating a file that used to have good data down to
|
|
|
|
* zero. Make sure it gets into the ordered flush list so that
|
|
|
|
* any new writes get down to disk quickly.
|
|
|
|
*/
|
|
|
|
if (newsize == 0)
|
2012-05-23 18:13:11 +00:00
|
|
|
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2009-11-12 09:35:36 +00:00
|
|
|
|
2011-01-31 20:30:16 +00:00
|
|
|
truncate_setsize(inode, newsize);
|
2013-02-08 07:01:08 +00:00
|
|
|
|
|
|
|
/* Disable nonlocked read DIO to avoid the end less truncate */
|
2017-02-20 11:51:10 +00:00
|
|
|
btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
|
2013-02-08 07:01:08 +00:00
|
|
|
inode_dio_wait(inode);
|
2017-02-20 11:51:11 +00:00
|
|
|
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
|
2013-02-08 07:01:08 +00:00
|
|
|
|
2018-02-06 20:40:31 +00:00
|
|
|
ret = btrfs_truncate(inode, newsize == oldsize);
|
2013-08-29 20:43:28 +00:00
|
|
|
if (ret && inode->i_nlink) {
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/*
|
2018-05-11 20:13:32 +00:00
|
|
|
* Truncate failed, so fix up the in-memory size. We
|
|
|
|
* adjusted disk_i_size down as we removed extents, so
|
|
|
|
* wait for disk_i_size to be stable and then update the
|
|
|
|
* in-memory size to match.
|
2013-08-29 20:43:28 +00:00
|
|
|
*/
|
2018-05-11 20:13:32 +00:00
|
|
|
err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
|
2013-08-29 20:43:28 +00:00
|
|
|
if (err)
|
2018-05-11 20:13:32 +00:00
|
|
|
return err;
|
|
|
|
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
|
2013-08-29 20:43:28 +00:00
|
|
|
}
|
2009-11-12 09:35:36 +00:00
|
|
|
}
|
|
|
|
|
2011-01-31 20:30:16 +00:00
|
|
|
return ret;
|
2009-11-12 09:35:36 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 18:19:41 +00:00
|
|
|
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
|
|
|
|
{
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2010-12-20 08:04:08 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-10-30 18:19:41 +00:00
|
|
|
int err;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2010-12-20 08:04:08 +00:00
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
|
2016-05-26 14:55:18 +00:00
|
|
|
err = setattr_prepare(dentry, attr);
|
2008-10-30 18:19:41 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
2007-08-30 15:54:02 +00:00
|
|
|
|
2009-03-31 17:27:11 +00:00
|
|
|
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
|
2013-01-12 02:57:22 +00:00
|
|
|
err = btrfs_setsize(inode, attr);
|
2009-11-12 09:35:36 +00:00
|
|
|
if (err)
|
|
|
|
return err;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2008-10-30 18:19:41 +00:00
|
|
|
|
2010-06-04 09:30:02 +00:00
|
|
|
if (attr->ia_valid) {
|
|
|
|
setattr_copy(inode, attr);
|
2012-04-05 19:03:02 +00:00
|
|
|
inode_inc_iversion(inode);
|
2011-11-30 15:45:38 +00:00
|
|
|
err = btrfs_dirty_inode(inode);
|
2010-06-04 09:30:02 +00:00
|
|
|
|
2011-11-30 15:45:38 +00:00
|
|
|
if (!err && attr->ia_valid & ATTR_MODE)
|
2013-12-20 13:16:43 +00:00
|
|
|
err = posix_acl_chmod(inode, inode->i_mode);
|
2010-06-04 09:30:02 +00:00
|
|
|
}
|
2008-07-24 16:16:36 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
return err;
|
|
|
|
}
|
2008-01-14 21:24:38 +00:00
|
|
|
|
2013-11-19 22:29:35 +00:00
|
|
|
/*
|
|
|
|
* While truncating the inode pages during eviction, we get the VFS calling
|
|
|
|
* btrfs_invalidatepage() against each page of the inode. This is slow because
|
|
|
|
* the calls to btrfs_invalidatepage() result in a huge amount of calls to
|
|
|
|
* lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
|
|
|
|
* extent_state structures over and over, wasting lots of time.
|
|
|
|
*
|
|
|
|
* Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
|
|
|
|
* those expensive operations on a per page basis and do only the ordered io
|
|
|
|
* finishing, while we release here the extent_map and extent_state structures,
|
|
|
|
* without the excessive merging and splitting.
|
|
|
|
*/
|
|
|
|
static void evict_inode_truncate_pages(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct rb_node *node;
|
|
|
|
|
|
|
|
ASSERT(inode->i_state & I_FREEING);
|
2014-04-03 21:47:49 +00:00
|
|
|
truncate_inode_pages_final(&inode->i_data);
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
write_lock(&map_tree->lock);
|
|
|
|
while (!RB_EMPTY_ROOT(&map_tree->map)) {
|
|
|
|
struct extent_map *em;
|
|
|
|
|
|
|
|
node = rb_first(&map_tree->map);
|
|
|
|
em = rb_entry(node, struct extent_map, rb_node);
|
2013-12-14 07:27:31 +00:00
|
|
|
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
|
2013-11-19 22:29:35 +00:00
|
|
|
remove_extent_mapping(map_tree, em);
|
|
|
|
free_extent_map(em);
|
2014-08-08 01:47:05 +00:00
|
|
|
if (need_resched()) {
|
|
|
|
write_unlock(&map_tree->lock);
|
|
|
|
cond_resched();
|
|
|
|
write_lock(&map_tree->lock);
|
|
|
|
}
|
2013-11-19 22:29:35 +00:00
|
|
|
}
|
|
|
|
write_unlock(&map_tree->lock);
|
|
|
|
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-25 23:55:42 +00:00
|
|
|
/*
|
|
|
|
* Keep looping until we have no more ranges in the io tree.
|
|
|
|
* We can have ongoing bios started by readpages (called from readahead)
|
2015-06-10 11:55:41 +00:00
|
|
|
* that have their endio callback (extent_io.c:end_bio_extent_readpage)
|
|
|
|
* still in progress (unlocked the pages in the bio but did not yet
|
|
|
|
* unlocked the ranges in the io tree). Therefore this means some
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-25 23:55:42 +00:00
|
|
|
* ranges can still be locked and eviction started because before
|
|
|
|
* submitting those bios, which are executed by a separate task (work
|
|
|
|
* queue kthread), inode references (inode->i_count) were not taken
|
|
|
|
* (which would be dropped in the end io callback of each bio).
|
|
|
|
* Therefore here we effectively end up waiting for those bios and
|
|
|
|
* anyone else holding locked ranges without having bumped the inode's
|
|
|
|
* reference count - if we don't do it, when they access the inode's
|
|
|
|
* io_tree to unlock a range it may be too late, leading to an
|
|
|
|
* use-after-free issue.
|
|
|
|
*/
|
2013-11-19 22:29:35 +00:00
|
|
|
spin_lock(&io_tree->lock);
|
|
|
|
while (!RB_EMPTY_ROOT(&io_tree->state)) {
|
|
|
|
struct extent_state *state;
|
|
|
|
struct extent_state *cached_state = NULL;
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-25 23:55:42 +00:00
|
|
|
u64 start;
|
|
|
|
u64 end;
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
node = rb_first(&io_tree->state);
|
|
|
|
state = rb_entry(node, struct extent_state, rb_node);
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-25 23:55:42 +00:00
|
|
|
start = state->start;
|
|
|
|
end = state->end;
|
2013-11-19 22:29:35 +00:00
|
|
|
spin_unlock(&io_tree->lock);
|
|
|
|
|
2015-12-03 13:30:40 +00:00
|
|
|
lock_extent_bits(io_tree, start, end, &cached_state);
|
2015-09-29 02:35:16 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If still has DELALLOC flag, the extent didn't reach disk,
|
|
|
|
* and its reserved space won't be freed by delayed_ref.
|
|
|
|
* So we need to free its reserved space here.
|
|
|
|
* (Refer to comment in btrfs_invalidatepage, case 2)
|
|
|
|
*
|
|
|
|
* Note, end is the bytenr of last byte, so we need + 1 here.
|
|
|
|
*/
|
|
|
|
if (state->state & EXTENT_DELALLOC)
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
|
2015-09-29 02:35:16 +00:00
|
|
|
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-25 23:55:42 +00:00
|
|
|
clear_extent_bit(io_tree, start, end,
|
2013-11-19 22:29:35 +00:00
|
|
|
EXTENT_LOCKED | EXTENT_DIRTY |
|
|
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
|
2017-10-31 15:37:52 +00:00
|
|
|
EXTENT_DEFRAG, 1, 1, &cached_state);
|
2013-11-19 22:29:35 +00:00
|
|
|
|
2014-08-08 01:47:05 +00:00
|
|
|
cond_resched();
|
2013-11-19 22:29:35 +00:00
|
|
|
spin_lock(&io_tree->lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&io_tree->lock);
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:13:36 +00:00
|
|
|
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
|
|
|
|
struct btrfs_block_rsv *rsv,
|
|
|
|
u64 min_size)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
|
|
|
|
int failures = 0;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = btrfs_block_rsv_refill(root, rsv, min_size,
|
|
|
|
BTRFS_RESERVE_FLUSH_LIMIT);
|
|
|
|
|
|
|
|
if (ret && ++failures > 2) {
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"could not allocate space for a delete; will truncate on mount");
|
|
|
|
return ERR_PTR(-ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans) || !ret)
|
|
|
|
return trans;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to steal from the global reserve if there is space for
|
|
|
|
* it.
|
|
|
|
*/
|
|
|
|
if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
|
|
|
|
!btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
|
|
|
|
return trans;
|
|
|
|
|
|
|
|
/* If not, commit and try again. */
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-06-07 15:35:40 +00:00
|
|
|
void btrfs_evict_inode(struct inode *inode)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2018-05-11 20:13:36 +00:00
|
|
|
struct btrfs_block_rsv *rsv;
|
2016-06-29 06:46:41 +00:00
|
|
|
u64 min_size;
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret;
|
|
|
|
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
trace_btrfs_inode_evict(inode);
|
|
|
|
|
2016-06-29 06:46:41 +00:00
|
|
|
if (!root) {
|
2018-01-25 18:02:53 +00:00
|
|
|
clear_inode(inode);
|
2016-06-29 06:46:41 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
|
2016-06-29 06:46:41 +00:00
|
|
|
|
2013-11-19 22:29:35 +00:00
|
|
|
evict_inode_truncate_pages(inode);
|
|
|
|
|
2013-09-05 14:58:43 +00:00
|
|
|
if (inode->i_nlink &&
|
|
|
|
((btrfs_root_refs(&root->root_item) != 0 &&
|
|
|
|
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
|
2017-02-20 11:50:35 +00:00
|
|
|
btrfs_is_free_space_inode(BTRFS_I(inode))))
|
2010-06-07 15:35:40 +00:00
|
|
|
goto no_delete;
|
|
|
|
|
2018-05-11 20:13:37 +00:00
|
|
|
if (is_bad_inode(inode))
|
2007-06-12 10:35:45 +00:00
|
|
|
goto no_delete;
|
2010-06-07 15:35:40 +00:00
|
|
|
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
|
2015-09-12 01:44:17 +00:00
|
|
|
if (!special_file(inode->i_mode))
|
|
|
|
btrfs_wait_ordered_range(inode, 0, (u64)-1);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2017-02-20 11:50:57 +00:00
|
|
|
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
|
Btrfs: cleanup the read failure record after write or when the inode is freeing
After the data is written successfully, we should cleanup the read failure record
in that range because
- If we set data COW for the file, the range that the failure record pointed to is
mapped to a new place, so it is invalid.
- If we set no data COW for the file, and if there is no error during writting,
the corrupted data is corrected, so the failure record can be removed. And if
some errors happen on the mirrors, we also needn't worry about it because the
failure record will be recreated if we read the same place again.
Sometimes, we may fail to correct the data, so the failure records will be left
in the tree, we need free them when we free the inode or the memory leak happens.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-12 10:44:04 +00:00
|
|
|
|
2018-05-11 20:13:33 +00:00
|
|
|
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
|
2009-11-12 09:34:40 +00:00
|
|
|
goto no_delete;
|
|
|
|
|
2009-09-21 20:00:26 +00:00
|
|
|
if (inode->i_nlink > 0) {
|
2013-09-05 14:58:43 +00:00
|
|
|
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
|
|
|
|
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
|
2009-09-21 20:00:26 +00:00
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
|
2017-01-10 18:35:40 +00:00
|
|
|
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
|
2018-05-11 20:13:37 +00:00
|
|
|
if (ret)
|
2012-12-19 06:59:51 +00:00
|
|
|
goto no_delete;
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
|
2018-05-11 20:13:37 +00:00
|
|
|
if (!rsv)
|
2011-08-05 17:22:24 +00:00
|
|
|
goto no_delete;
|
2011-08-29 15:01:31 +00:00
|
|
|
rsv->size = min_size;
|
2012-08-27 21:48:15 +00:00
|
|
|
rsv->failfast = 1;
|
2011-08-05 17:22:24 +00:00
|
|
|
|
2017-02-20 11:50:34 +00:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), 0);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
while (1) {
|
2018-05-11 20:13:36 +00:00
|
|
|
trans = evict_refill_and_join(root, rsv, min_size);
|
2018-05-11 20:13:37 +00:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
goto free_rsv;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2011-08-05 17:22:24 +00:00
|
|
|
trans->block_rsv = rsv;
|
|
|
|
|
2010-05-16 14:49:58 +00:00
|
|
|
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
|
2018-05-11 20:13:37 +00:00
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
|
|
|
if (ret && ret != -ENOSPC && ret != -EAGAIN)
|
|
|
|
goto free_rsv;
|
|
|
|
else if (!ret)
|
2009-11-12 09:35:36 +00:00
|
|
|
break;
|
|
|
|
}
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2013-08-13 18:10:08 +00:00
|
|
|
/*
|
2018-05-11 20:13:37 +00:00
|
|
|
* Errors here aren't a big deal, it just means we leave orphan items in
|
|
|
|
* the tree. They will be cleaned up on the next mount. If the inode
|
|
|
|
* number gets reused, cleanup deletes the orphan item without doing
|
|
|
|
* anything, and unlink reuses the existing orphan item.
|
|
|
|
*
|
|
|
|
* If it turns out that we are dropping too many of these, we might want
|
|
|
|
* to add a mechanism for retrying these after a commit.
|
2013-08-13 18:10:08 +00:00
|
|
|
*/
|
2018-05-11 20:13:37 +00:00
|
|
|
trans = evict_refill_and_join(root, rsv, min_size);
|
|
|
|
if (!IS_ERR(trans)) {
|
|
|
|
trans->block_rsv = rsv;
|
|
|
|
btrfs_orphan_del(trans, BTRFS_I(inode));
|
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
}
|
2007-06-22 18:16:25 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
if (!(root == fs_info->tree_root ||
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
|
2017-01-10 18:35:31 +00:00
|
|
|
btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
|
2018-05-11 20:13:37 +00:00
|
|
|
free_rsv:
|
|
|
|
btrfs_free_block_rsv(fs_info, rsv);
|
2007-06-12 10:35:45 +00:00
|
|
|
no_delete:
|
2018-05-11 20:13:37 +00:00
|
|
|
/*
|
|
|
|
* If we didn't successfully delete, the orphan item will still be in
|
|
|
|
* the tree and we'll retry on the next mount. Again, we might also want
|
|
|
|
* to retry these periodically in the future.
|
|
|
|
*/
|
2017-01-10 18:35:39 +00:00
|
|
|
btrfs_remove_delayed_node(BTRFS_I(inode));
|
2012-05-03 12:48:02 +00:00
|
|
|
clear_inode(inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this returns the key found in the dir entry in the location pointer.
|
2018-03-05 09:13:37 +00:00
|
|
|
* If no dir entries were found, returns -ENOENT.
|
|
|
|
* If found a corrupted location in dir entry, returns -EUCLEAN.
|
2007-06-12 10:35:45 +00:00
|
|
|
*/
|
|
|
|
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
|
|
|
|
struct btrfs_key *location)
|
|
|
|
{
|
|
|
|
const char *name = dentry->d_name.name;
|
|
|
|
int namelen = dentry->d_name.len;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-10-25 19:48:28 +00:00
|
|
|
int ret = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 17:38:47 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2007-12-12 19:38:19 +00:00
|
|
|
|
2017-01-20 13:54:07 +00:00
|
|
|
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
|
|
|
|
name, namelen, 0);
|
2018-03-05 09:13:37 +00:00
|
|
|
if (!di) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (IS_ERR(di)) {
|
2007-10-25 19:48:28 +00:00
|
|
|
ret = PTR_ERR(di);
|
2018-03-05 09:13:37 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2009-01-06 02:25:51 +00:00
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
|
2017-10-30 17:14:38 +00:00
|
|
|
if (location->type != BTRFS_INODE_ITEM_KEY &&
|
|
|
|
location->type != BTRFS_ROOT_ITEM_KEY) {
|
2018-03-05 09:13:37 +00:00
|
|
|
ret = -EUCLEAN;
|
2017-10-30 17:14:38 +00:00
|
|
|
btrfs_warn(root->fs_info,
|
|
|
|
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
|
|
|
|
__func__, name, btrfs_ino(BTRFS_I(dir)),
|
|
|
|
location->objectid, location->type, location->offset);
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* when we hit a tree root in a directory, the btrfs part of the inode
|
|
|
|
* needs to be changed to reflect the root directory of the tree root. This
|
|
|
|
* is kind of like crossing a mount point.
|
|
|
|
*/
|
2016-06-22 22:54:24 +00:00
|
|
|
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
|
2009-09-21 19:56:00 +00:00
|
|
|
struct inode *dir,
|
|
|
|
struct dentry *dentry,
|
|
|
|
struct btrfs_key *location,
|
|
|
|
struct btrfs_root **sub_root)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2009-09-21 19:56:00 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *new_root;
|
|
|
|
struct btrfs_root_ref *ref;
|
|
|
|
struct extent_buffer *leaf;
|
2015-01-02 18:36:14 +00:00
|
|
|
struct btrfs_key key;
|
2009-09-21 19:56:00 +00:00
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
err = -ENOENT;
|
2015-01-02 18:36:14 +00:00
|
|
|
key.objectid = BTRFS_I(dir)->root->root_key.objectid;
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
key.offset = location->objectid;
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
|
2009-09-21 19:56:00 +00:00
|
|
|
if (ret) {
|
|
|
|
if (ret < 0)
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
|
2017-01-10 18:35:31 +00:00
|
|
|
if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
|
2009-09-21 19:56:00 +00:00
|
|
|
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
|
|
|
|
goto out;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
|
|
|
|
(unsigned long)(ref + 1),
|
|
|
|
dentry->d_name.len);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2009-09-21 19:56:00 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
new_root = btrfs_read_fs_root_no_name(fs_info, location);
|
2009-09-21 19:56:00 +00:00
|
|
|
if (IS_ERR(new_root)) {
|
|
|
|
err = PTR_ERR(new_root);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
*sub_root = new_root;
|
|
|
|
location->objectid = btrfs_root_dirid(&new_root->root_item);
|
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
location->offset = 0;
|
|
|
|
err = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return err;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
static void inode_tree_add(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_inode *entry;
|
2009-08-21 08:09:44 +00:00
|
|
|
struct rb_node **p;
|
|
|
|
struct rb_node *parent;
|
2013-09-02 11:19:13 +00:00
|
|
|
struct rb_node *new = &BTRFS_I(inode)->rb_node;
|
2017-01-10 18:35:31 +00:00
|
|
|
u64 ino = btrfs_ino(BTRFS_I(inode));
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
|
2010-10-23 19:19:20 +00:00
|
|
|
if (inode_unhashed(inode))
|
2009-09-21 20:00:26 +00:00
|
|
|
return;
|
2013-05-15 07:48:16 +00:00
|
|
|
parent = NULL;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
spin_lock(&root->inode_lock);
|
2013-05-15 07:48:16 +00:00
|
|
|
p = &root->inode_tree.rb_node;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct btrfs_inode, rb_node);
|
|
|
|
|
2018-06-29 08:56:40 +00:00
|
|
|
if (ino < btrfs_ino(entry))
|
2009-08-21 08:09:44 +00:00
|
|
|
p = &parent->rb_left;
|
2018-06-29 08:56:40 +00:00
|
|
|
else if (ino > btrfs_ino(entry))
|
2009-08-21 08:09:44 +00:00
|
|
|
p = &parent->rb_right;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
else {
|
|
|
|
WARN_ON(!(entry->vfs_inode.i_state &
|
2010-06-02 21:38:30 +00:00
|
|
|
(I_WILL_FREE | I_FREEING)));
|
2013-09-02 11:19:13 +00:00
|
|
|
rb_replace_node(parent, new, &root->inode_tree);
|
2009-08-21 08:09:44 +00:00
|
|
|
RB_CLEAR_NODE(parent);
|
|
|
|
spin_unlock(&root->inode_lock);
|
2013-09-02 11:19:13 +00:00
|
|
|
return;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
}
|
|
|
|
}
|
2013-09-02 11:19:13 +00:00
|
|
|
rb_link_node(new, parent, p);
|
|
|
|
rb_insert_color(new, &root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void inode_tree_del(struct inode *inode)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2009-09-21 20:00:26 +00:00
|
|
|
int empty = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
|
2009-08-21 08:09:44 +00:00
|
|
|
spin_lock(&root->inode_lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
|
|
|
|
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
|
|
|
|
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
|
2009-09-21 20:00:26 +00:00
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
}
|
2009-08-21 08:09:44 +00:00
|
|
|
spin_unlock(&root->inode_lock);
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2013-09-05 14:58:43 +00:00
|
|
|
if (empty && btrfs_root_refs(&root->root_item) == 0) {
|
2016-06-22 22:54:23 +00:00
|
|
|
synchronize_srcu(&fs_info->subvol_srcu);
|
2009-09-21 20:00:26 +00:00
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
if (empty)
|
|
|
|
btrfs_add_dead_root(root);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
|
2008-09-05 20:13:11 +00:00
|
|
|
static int btrfs_init_locked_inode(struct inode *inode, void *p)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args *args = p;
|
2014-01-10 01:28:00 +00:00
|
|
|
inode->i_ino = args->location->objectid;
|
|
|
|
memcpy(&BTRFS_I(inode)->location, args->location,
|
|
|
|
sizeof(*args->location));
|
2008-09-05 20:13:11 +00:00
|
|
|
BTRFS_I(inode)->root = args->root;
|
2007-06-12 10:35:45 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_find_actor(struct inode *inode, void *opaque)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args *args = opaque;
|
2014-01-10 01:28:00 +00:00
|
|
|
return args->location->objectid == BTRFS_I(inode)->location.objectid &&
|
2009-01-06 02:25:51 +00:00
|
|
|
args->root == BTRFS_I(inode)->root;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
static struct inode *btrfs_iget_locked(struct super_block *s,
|
2014-01-10 01:28:00 +00:00
|
|
|
struct btrfs_key *location,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
struct btrfs_root *root)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_iget_args args;
|
2014-01-10 01:28:00 +00:00
|
|
|
unsigned long hashval = btrfs_inode_hash(location->objectid, root);
|
Btrfs: improve inode hash function/inode lookup
Currently the hash value used for adding an inode to the VFS's inode
hash table consists of the plain inode number, which is a 64 bits
integer. This results in hash table buckets (hlist_head lists) with
too many elements for at least 2 important scenarios:
1) When we have many subvolumes. Each subvolume has its own btree
where its files and directories are added to, and each has its
own objectid (inode number) namespace. This means that if we have
N subvolumes, and all have inode number X associated to a file or
directory, the corresponding inodes all map to the same hash table
entry, resulting in a bucket (hlist_head list) with N elements;
2) On 32 bits machines. The VFS hash values are unsigned longs, which
are 32 bits wide on 32 bits machines, and the inode (objectid)
numbers are 64 bits unsigned integers. We simply cast the inode
numbers to hash values, which means that for all inodes with the
same 32 bits lower half, the same hash bucket is used for all of
them. For example, all inodes with a number (objectid) between
0x0000_0000_ffff_ffff and 0xffff_ffff_ffff_ffff will end up in
the same hash table bucket.
This change ensures the inode's hash value depends both on the
objectid (inode number) and its subvolume's (btree root) objectid.
For 32 bits machines, this change gives better entropy by making
the hash value depend on both the upper and lower 32 bits of the
64 bits hash previously computed.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-10-06 21:22:33 +00:00
|
|
|
|
2014-01-10 01:28:00 +00:00
|
|
|
args.location = location;
|
2007-06-12 10:35:45 +00:00
|
|
|
args.root = root;
|
|
|
|
|
Btrfs: improve inode hash function/inode lookup
Currently the hash value used for adding an inode to the VFS's inode
hash table consists of the plain inode number, which is a 64 bits
integer. This results in hash table buckets (hlist_head lists) with
too many elements for at least 2 important scenarios:
1) When we have many subvolumes. Each subvolume has its own btree
where its files and directories are added to, and each has its
own objectid (inode number) namespace. This means that if we have
N subvolumes, and all have inode number X associated to a file or
directory, the corresponding inodes all map to the same hash table
entry, resulting in a bucket (hlist_head list) with N elements;
2) On 32 bits machines. The VFS hash values are unsigned longs, which
are 32 bits wide on 32 bits machines, and the inode (objectid)
numbers are 64 bits unsigned integers. We simply cast the inode
numbers to hash values, which means that for all inodes with the
same 32 bits lower half, the same hash bucket is used for all of
them. For example, all inodes with a number (objectid) between
0x0000_0000_ffff_ffff and 0xffff_ffff_ffff_ffff will end up in
the same hash table bucket.
This change ensures the inode's hash value depends both on the
objectid (inode number) and its subvolume's (btree root) objectid.
For 32 bits machines, this change gives better entropy by making
the hash value depend on both the upper and lower 32 bits of the
64 bits hash previously computed.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-10-06 21:22:33 +00:00
|
|
|
inode = iget5_locked(s, hashval, btrfs_find_actor,
|
2007-06-12 10:35:45 +00:00
|
|
|
btrfs_init_locked_inode,
|
|
|
|
(void *)&args);
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2008-07-20 20:31:04 +00:00
|
|
|
/* Get an inode object given its location and corresponding root.
|
|
|
|
* Returns in *is_new if the inode was read from disk
|
|
|
|
*/
|
|
|
|
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 17:38:27 +00:00
|
|
|
struct btrfs_root *root, int *new)
|
2008-07-20 20:31:04 +00:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
|
2014-01-10 01:28:00 +00:00
|
|
|
inode = btrfs_iget_locked(s, location, root);
|
2008-07-20 20:31:04 +00:00
|
|
|
if (!inode)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2008-07-20 20:31:04 +00:00
|
|
|
|
|
|
|
if (inode->i_state & I_NEW) {
|
2016-06-06 10:51:25 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = btrfs_read_locked_inode(inode);
|
2011-07-12 18:25:31 +00:00
|
|
|
if (!is_bad_inode(inode)) {
|
|
|
|
inode_tree_add(inode);
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
if (new)
|
|
|
|
*new = 1;
|
|
|
|
} else {
|
2011-09-11 14:52:24 +00:00
|
|
|
unlock_new_inode(inode);
|
|
|
|
iput(inode);
|
2016-06-06 10:51:25 +00:00
|
|
|
ASSERT(ret < 0);
|
|
|
|
inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
|
2011-07-12 18:25:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-20 20:31:04 +00:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
static struct inode *new_simple_dir(struct super_block *s,
|
|
|
|
struct btrfs_key *key,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct inode *inode = new_inode(s);
|
|
|
|
|
|
|
|
if (!inode)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
BTRFS_I(inode)->root = root;
|
|
|
|
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
|
2012-05-23 18:13:11 +00:00
|
|
|
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
|
2009-09-21 19:56:00 +00:00
|
|
|
|
|
|
|
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
|
2012-02-21 09:04:28 +00:00
|
|
|
inode->i_op = &btrfs_dir_ro_inode_operations;
|
2017-01-26 01:06:39 +00:00
|
|
|
inode->i_opflags &= ~IOP_XATTR;
|
2009-09-21 19:56:00 +00:00
|
|
|
inode->i_fop = &simple_dir_operations;
|
|
|
|
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
|
2016-09-14 14:48:06 +00:00
|
|
|
inode->i_mtime = current_time(inode);
|
2012-07-04 07:18:07 +00:00
|
|
|
inode->i_atime = inode->i_mtime;
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
2018-06-21 16:04:06 +00:00
|
|
|
BTRFS_I(inode)->i_otime = inode->i_mtime;
|
2009-09-21 19:56:00 +00:00
|
|
|
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2008-11-18 02:02:50 +00:00
|
|
|
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2009-01-06 02:25:51 +00:00
|
|
|
struct inode *inode;
|
2009-09-21 19:56:00 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *sub_root = root;
|
|
|
|
struct btrfs_key location;
|
2009-09-21 20:00:26 +00:00
|
|
|
int index;
|
2011-06-28 20:18:59 +00:00
|
|
|
int ret = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
if (dentry->d_name.len > BTRFS_NAME_LEN)
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2012-11-28 16:30:53 +00:00
|
|
|
ret = btrfs_inode_by_name(dir, dentry, &location);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (ret < 0)
|
|
|
|
return ERR_PTR(ret);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
if (location.type == BTRFS_INODE_ITEM_KEY) {
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 17:38:27 +00:00
|
|
|
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
|
2009-09-21 19:56:00 +00:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
index = srcu_read_lock(&fs_info->subvol_srcu);
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = fixup_tree_root_location(fs_info, dir, dentry,
|
2009-09-21 19:56:00 +00:00
|
|
|
&location, &sub_root);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret != -ENOENT)
|
|
|
|
inode = ERR_PTR(ret);
|
|
|
|
else
|
|
|
|
inode = new_simple_dir(dir->i_sb, &location, sub_root);
|
|
|
|
} else {
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 17:38:27 +00:00
|
|
|
inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2016-06-22 22:54:23 +00:00
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2011-01-24 19:55:19 +00:00
|
|
|
if (!IS_ERR(inode) && root != sub_root) {
|
2016-06-22 22:54:23 +00:00
|
|
|
down_read(&fs_info->cleanup_work_sem);
|
2017-07-17 07:45:34 +00:00
|
|
|
if (!sb_rdonly(inode->i_sb))
|
2011-01-31 21:22:42 +00:00
|
|
|
ret = btrfs_orphan_cleanup(sub_root);
|
2016-06-22 22:54:23 +00:00
|
|
|
up_read(&fs_info->cleanup_work_sem);
|
2013-06-04 01:39:49 +00:00
|
|
|
if (ret) {
|
|
|
|
iput(inode);
|
2011-01-31 21:22:42 +00:00
|
|
|
inode = ERR_PTR(ret);
|
2013-06-04 01:39:49 +00:00
|
|
|
}
|
2009-11-12 09:34:40 +00:00
|
|
|
}
|
|
|
|
|
2008-11-18 02:02:50 +00:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2011-01-07 06:49:23 +00:00
|
|
|
static int btrfs_dentry_delete(const struct dentry *dentry)
|
2009-09-21 20:00:26 +00:00
|
|
|
{
|
|
|
|
struct btrfs_root *root;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2012-02-21 09:04:28 +00:00
|
|
|
if (!inode && !IS_ROOT(dentry))
|
2015-03-17 22:25:59 +00:00
|
|
|
inode = d_inode(dentry->d_parent);
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2012-02-21 09:04:28 +00:00
|
|
|
if (inode) {
|
|
|
|
root = BTRFS_I(inode)->root;
|
2009-10-09 13:25:16 +00:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
|
|
|
return 1;
|
2012-02-21 09:04:28 +00:00
|
|
|
|
2017-01-10 18:35:31 +00:00
|
|
|
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2012-02-21 09:04:28 +00:00
|
|
|
return 1;
|
2009-10-09 13:25:16 +00:00
|
|
|
}
|
2009-09-21 20:00:26 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-11-18 02:02:50 +00:00
|
|
|
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
|
2012-06-10 21:13:09 +00:00
|
|
|
unsigned int flags)
|
2008-11-18 02:02:50 +00:00
|
|
|
{
|
2013-12-13 00:51:42 +00:00
|
|
|
struct inode *inode;
|
2011-09-18 14:34:03 +00:00
|
|
|
|
2013-12-13 00:51:42 +00:00
|
|
|
inode = btrfs_lookup_dentry(dir, dentry);
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
if (PTR_ERR(inode) == -ENOENT)
|
|
|
|
inode = NULL;
|
|
|
|
else
|
|
|
|
return ERR_CAST(inode);
|
|
|
|
}
|
|
|
|
|
2014-10-13 02:24:21 +00:00
|
|
|
return d_splice_alias(inode, dentry);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comments.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
unsigned char btrfs_filetype_table[] = {
|
2007-06-12 10:35:45 +00:00
|
|
|
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
|
|
|
|
};
|
|
|
|
|
2017-07-24 19:14:25 +00:00
|
|
|
/*
|
|
|
|
* All this infrastructure exists because dir_emit can fault, and we are holding
|
|
|
|
* the tree lock when doing readdir. For now just allocate a buffer and copy
|
|
|
|
* our information into that, and then dir_emit from the buffer. This is
|
|
|
|
* similar to what NFS does, only we don't keep the buffer around in pagecache
|
|
|
|
* because I'm afraid I'll mess that up. Long term we need to make filldir do
|
|
|
|
* copy_to_user_inatomic so we don't have to worry about page faulting under the
|
|
|
|
* tree lock.
|
|
|
|
*/
|
|
|
|
static int btrfs_opendir(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct btrfs_file_private *private;
|
|
|
|
|
|
|
|
private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
|
|
|
|
if (!private)
|
|
|
|
return -ENOMEM;
|
|
|
|
private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
|
|
|
|
if (!private->filldir_buf) {
|
|
|
|
kfree(private);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
file->private_data = private;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct dir_entry {
|
|
|
|
u64 ino;
|
|
|
|
u64 offset;
|
|
|
|
unsigned type;
|
|
|
|
int name_len;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
|
|
|
|
{
|
|
|
|
while (entries--) {
|
|
|
|
struct dir_entry *entry = addr;
|
|
|
|
char *name = (char *)(entry + 1);
|
|
|
|
|
2018-04-16 19:10:14 +00:00
|
|
|
ctx->pos = get_unaligned(&entry->offset);
|
|
|
|
if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
|
|
|
|
get_unaligned(&entry->ino),
|
|
|
|
get_unaligned(&entry->type)))
|
2017-07-24 19:14:25 +00:00
|
|
|
return 1;
|
2018-04-16 19:10:14 +00:00
|
|
|
addr += sizeof(struct dir_entry) +
|
|
|
|
get_unaligned(&entry->name_len);
|
2017-07-24 19:14:25 +00:00
|
|
|
ctx->pos++;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-05-22 20:48:09 +00:00
|
|
|
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2013-05-22 20:48:09 +00:00
|
|
|
struct inode *inode = file_inode(file);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2017-07-24 19:14:25 +00:00
|
|
|
struct btrfs_file_private *private = file->private_data;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_key key;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct btrfs_key found_key;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_path *path;
|
2017-07-24 19:14:25 +00:00
|
|
|
void *addr;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comments.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
struct list_head ins_list;
|
|
|
|
struct list_head del_list;
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 10:35:45 +00:00
|
|
|
int slot;
|
2007-10-15 20:14:19 +00:00
|
|
|
char *name_ptr;
|
|
|
|
int name_len;
|
2017-07-24 19:14:25 +00:00
|
|
|
int entries = 0;
|
|
|
|
int total_len = 0;
|
2016-05-20 20:50:33 +00:00
|
|
|
bool put = false;
|
2016-11-21 14:59:04 +00:00
|
|
|
struct btrfs_key location;
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2013-05-22 20:48:09 +00:00
|
|
|
if (!dir_emit_dots(file, ctx))
|
|
|
|
return 0;
|
|
|
|
|
2008-08-17 16:08:36 +00:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2011-05-28 11:00:39 +00:00
|
|
|
|
2017-07-24 19:14:25 +00:00
|
|
|
addr = private->filldir_buf;
|
2015-11-27 15:31:35 +00:00
|
|
|
path->reada = READA_FORWARD;
|
2008-08-17 16:08:36 +00:00
|
|
|
|
2016-11-21 14:59:04 +00:00
|
|
|
INIT_LIST_HEAD(&ins_list);
|
|
|
|
INIT_LIST_HEAD(&del_list);
|
|
|
|
put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
|
2017-07-24 19:14:25 +00:00
|
|
|
again:
|
2016-11-21 14:59:04 +00:00
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
2013-05-22 20:48:09 +00:00
|
|
|
key.offset = ctx->pos;
|
2017-01-10 18:35:31 +00:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err;
|
2008-08-17 16:08:36 +00:00
|
|
|
|
|
|
|
while (1) {
|
2017-07-24 19:14:25 +00:00
|
|
|
struct dir_entry *entry;
|
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
2007-06-12 10:35:45 +00:00
|
|
|
slot = path->slots[0];
|
2011-03-23 02:43:58 +00:00
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2008-11-18 02:02:50 +00:00
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.objectid != key.objectid)
|
2007-06-12 10:35:45 +00:00
|
|
|
break;
|
2016-11-21 14:59:04 +00:00
|
|
|
if (found_key.type != BTRFS_DIR_INDEX_KEY)
|
2007-06-12 10:35:45 +00:00
|
|
|
break;
|
2013-05-22 20:48:09 +00:00
|
|
|
if (found_key.offset < ctx->pos)
|
2011-03-23 02:43:58 +00:00
|
|
|
goto next;
|
2016-11-21 14:59:04 +00:00
|
|
|
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
goto next;
|
2007-06-12 10:35:45 +00:00
|
|
|
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
|
2016-11-21 14:59:04 +00:00
|
|
|
name_len = btrfs_dir_name_len(leaf, di);
|
2017-07-24 19:14:25 +00:00
|
|
|
if ((total_len + sizeof(struct dir_entry) + name_len) >=
|
|
|
|
PAGE_SIZE) {
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
|
|
|
|
if (ret)
|
|
|
|
goto nopos;
|
|
|
|
addr = private->filldir_buf;
|
|
|
|
entries = 0;
|
|
|
|
total_len = 0;
|
|
|
|
goto again;
|
2016-11-21 14:59:04 +00:00
|
|
|
}
|
2017-07-24 19:14:25 +00:00
|
|
|
|
|
|
|
entry = addr;
|
2018-04-16 19:10:14 +00:00
|
|
|
put_unaligned(name_len, &entry->name_len);
|
2017-07-24 19:14:25 +00:00
|
|
|
name_ptr = (char *)(entry + 1);
|
2016-11-21 14:59:04 +00:00
|
|
|
read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
|
|
|
|
name_len);
|
2018-04-16 19:10:14 +00:00
|
|
|
put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
|
|
|
|
&entry->type);
|
2016-11-21 14:59:04 +00:00
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &location);
|
2018-04-16 19:10:14 +00:00
|
|
|
put_unaligned(location.objectid, &entry->ino);
|
|
|
|
put_unaligned(found_key.offset, &entry->offset);
|
2017-07-24 19:14:25 +00:00
|
|
|
entries++;
|
|
|
|
addr += sizeof(struct dir_entry) + name_len;
|
|
|
|
total_len += sizeof(struct dir_entry) + name_len;
|
2011-03-23 02:43:58 +00:00
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2017-07-24 19:14:25 +00:00
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
|
|
|
|
if (ret)
|
|
|
|
goto nopos;
|
2008-08-17 16:08:36 +00:00
|
|
|
|
2016-11-05 17:26:35 +00:00
|
|
|
ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
|
2016-11-21 14:59:04 +00:00
|
|
|
if (ret)
|
btrfs: properly set the termination value of ctx->pos in readdir
The value of ctx->pos in the last readdir call is supposed to be set to
INT_MAX due to 32bit compatibility, unless 'pos' is intentionally set to a
larger value, then it's LLONG_MAX.
There's a report from PaX SIZE_OVERFLOW plugin that "ctx->pos++"
overflows (https://forums.grsecurity.net/viewtopic.php?f=1&t=4284), on a
64bit arch, where the value is 0x7fffffffffffffff ie. LLONG_MAX before
the increment.
We can get to that situation like that:
* emit all regular readdir entries
* still in the same call to readdir, bump the last pos to INT_MAX
* next call to readdir will not emit any entries, but will reach the
bump code again, finds pos to be INT_MAX and sets it to LLONG_MAX
Normally this is not a problem, but if we call readdir again, we'll find
'pos' set to LLONG_MAX and the unconditional increment will overflow.
The report from Victor at
(http://thread.gmane.org/gmane.comp.file-systems.btrfs/49500) with debugging
print shows that pattern:
Overflow: e
Overflow: 7fffffff
Overflow: 7fffffffffffffff
PAX: size overflow detected in function btrfs_real_readdir
fs/btrfs/inode.c:5760 cicus.935_282 max, count: 9, decl: pos; num: 0;
context: dir_context;
CPU: 0 PID: 2630 Comm: polkitd Not tainted 4.2.3-grsec #1
Hardware name: Gigabyte Technology Co., Ltd. H81ND2H/H81ND2H, BIOS F3 08/11/2015
ffffffff81901608 0000000000000000 ffffffff819015e6 ffffc90004973d48
ffffffff81742f0f 0000000000000007 ffffffff81901608 ffffc90004973d78
ffffffff811cb706 0000000000000000 ffff8800d47359e0 ffffc90004973ed8
Call Trace:
[<ffffffff81742f0f>] dump_stack+0x4c/0x7f
[<ffffffff811cb706>] report_size_overflow+0x36/0x40
[<ffffffff812ef0bc>] btrfs_real_readdir+0x69c/0x6d0
[<ffffffff811dafc8>] iterate_dir+0xa8/0x150
[<ffffffff811e6d8d>] ? __fget_light+0x2d/0x70
[<ffffffff811dba3a>] SyS_getdents+0xba/0x1c0
Overflow: 1a
[<ffffffff811db070>] ? iterate_dir+0x150/0x150
[<ffffffff81749b69>] entry_SYSCALL_64_fastpath+0x12/0x83
The jump from 7fffffff to 7fffffffffffffff happens when new dir entries
are not yet synced and are processed from the delayed list. Then the code
could go to the bump section again even though it might not emit any new
dir entries from the delayed list.
The fix avoids entering the "bump" section again once we've finished
emitting the entries, both for synced and delayed entries.
References: https://forums.grsecurity.net/viewtopic.php?f=1&t=4284
Reported-by: Victor <services@swwu.com>
CC: stable@vger.kernel.org
Signed-off-by: David Sterba <dsterba@suse.com>
Tested-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-13 12:44:28 +00:00
|
|
|
goto nopos;
|
|
|
|
|
2013-07-11 23:19:42 +00:00
|
|
|
/*
|
|
|
|
* Stop new entries from being returned after we return the last
|
|
|
|
* entry.
|
|
|
|
*
|
|
|
|
* New directory entries are assigned a strictly increasing
|
|
|
|
* offset. This means that new entries created during readdir
|
|
|
|
* are *guaranteed* to be seen in the future by that readdir.
|
|
|
|
* This has broken buggy programs which operate on names as
|
|
|
|
* they're returned by readdir. Until we re-use freed offsets
|
|
|
|
* we have this hack to stop new entries from being returned
|
|
|
|
* under the assumption that they'll never reach this huge
|
|
|
|
* offset.
|
|
|
|
*
|
|
|
|
* This is being careful not to overflow 32bit loff_t unless the
|
|
|
|
* last entry requires it because doing so has broken 32bit apps
|
|
|
|
* in the past.
|
|
|
|
*/
|
2016-11-21 14:59:04 +00:00
|
|
|
if (ctx->pos >= INT_MAX)
|
|
|
|
ctx->pos = LLONG_MAX;
|
|
|
|
else
|
|
|
|
ctx->pos = INT_MAX;
|
2007-06-12 10:35:45 +00:00
|
|
|
nopos:
|
|
|
|
ret = 0;
|
|
|
|
err:
|
2016-05-20 20:50:33 +00:00
|
|
|
if (put)
|
|
|
|
btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
|
2007-06-12 10:35:45 +00:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-06-22 18:16:25 +00:00
|
|
|
* This is somewhat expensive, updating the tree every time the
|
2007-06-12 10:35:45 +00:00
|
|
|
* inode changes. But, it is most likely to find the inode in cache.
|
|
|
|
* FIXME, needs more benchmarking...there are no reasons other than performance
|
|
|
|
* to keep or drop this code.
|
|
|
|
*/
|
2013-04-25 20:41:01 +00:00
|
|
|
static int btrfs_dirty_inode(struct inode *inode)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2010-05-16 14:49:58 +00:00
|
|
|
int ret;
|
|
|
|
|
2012-05-23 18:13:11 +00:00
|
|
|
if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
|
2011-11-30 15:45:38 +00:00
|
|
|
return 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2011-04-13 16:54:33 +00:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-11-30 15:45:38 +00:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2010-05-16 14:49:58 +00:00
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2010-05-26 15:02:00 +00:00
|
|
|
if (ret && ret == -ENOSPC) {
|
|
|
|
/* whoops, lets try again with the full transaction */
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2010-05-26 15:02:00 +00:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2011-11-30 15:45:38 +00:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2010-05-16 14:49:58 +00:00
|
|
|
|
2010-05-26 15:02:00 +00:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
}
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
if (BTRFS_I(inode)->delayed_node)
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_balance_delayed_items(fs_info);
|
2011-11-30 15:45:38 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is a copy of file_update_time. We need this so we can return error on
|
|
|
|
* ENOSPC for updating the inode in the case of file write and mmap writes.
|
|
|
|
*/
|
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-09 02:36:02 +00:00
|
|
|
static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
|
2012-03-26 13:46:47 +00:00
|
|
|
int flags)
|
2011-11-30 15:45:38 +00:00
|
|
|
{
|
2012-06-15 07:49:33 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2017-12-11 11:35:24 +00:00
|
|
|
bool dirty = flags & ~S_VERSION;
|
2012-06-15 07:49:33 +00:00
|
|
|
|
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
|
2012-03-26 13:46:47 +00:00
|
|
|
if (flags & S_VERSION)
|
2017-12-11 11:35:24 +00:00
|
|
|
dirty |= inode_maybe_inc_iversion(inode, dirty);
|
2012-03-26 13:46:47 +00:00
|
|
|
if (flags & S_CTIME)
|
|
|
|
inode->i_ctime = *now;
|
|
|
|
if (flags & S_MTIME)
|
|
|
|
inode->i_mtime = *now;
|
|
|
|
if (flags & S_ATIME)
|
|
|
|
inode->i_atime = *now;
|
2017-12-11 11:35:24 +00:00
|
|
|
return dirty ? btrfs_dirty_inode(inode) : 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* find the highest existing sequence number in a directory
|
|
|
|
* and then set the in-memory index_cnt variable to reflect
|
|
|
|
* free sequence numbers
|
|
|
|
*/
|
2017-02-20 11:50:32 +00:00
|
|
|
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
|
2008-07-24 16:12:38 +00:00
|
|
|
{
|
2017-02-20 11:50:32 +00:00
|
|
|
struct btrfs_root *root = inode->root;
|
2008-07-24 16:12:38 +00:00
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
int ret;
|
|
|
|
|
2017-02-20 11:50:32 +00:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2014-06-04 16:41:45 +00:00
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
2008-07-24 16:12:38 +00:00
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
/* FIXME: we should be able to handle this */
|
|
|
|
if (ret == 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MAGIC NUMBER EXPLANATION:
|
|
|
|
* since we search a directory based on f_pos we have to start at 2
|
|
|
|
* since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
|
|
|
|
* else has to start at 2
|
|
|
|
*/
|
|
|
|
if (path->slots[0] == 0) {
|
2017-02-20 11:50:32 +00:00
|
|
|
inode->index_cnt = 2;
|
2008-07-24 16:12:38 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
2017-02-20 11:50:32 +00:00
|
|
|
if (found_key.objectid != btrfs_ino(inode) ||
|
2014-06-04 16:41:45 +00:00
|
|
|
found_key.type != BTRFS_DIR_INDEX_KEY) {
|
2017-02-20 11:50:32 +00:00
|
|
|
inode->index_cnt = 2;
|
2008-07-24 16:12:38 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:50:32 +00:00
|
|
|
inode->index_cnt = found_key.offset + 1;
|
2008-07-24 16:12:38 +00:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* helper to find a free sequence number in a given directory. This current
|
|
|
|
* code is very simple, later versions will do smarter things in the btree
|
|
|
|
*/
|
2017-02-20 11:50:33 +00:00
|
|
|
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
|
2008-07-24 16:12:38 +00:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
2017-02-20 11:50:33 +00:00
|
|
|
if (dir->index_cnt == (u64)-1) {
|
|
|
|
ret = btrfs_inode_delayed_dir_index_count(dir);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
if (ret) {
|
|
|
|
ret = btrfs_set_inode_index_count(dir);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2008-07-24 16:12:38 +00:00
|
|
|
}
|
|
|
|
|
2017-02-20 11:50:33 +00:00
|
|
|
*index = dir->index_cnt;
|
|
|
|
dir->index_cnt++;
|
2008-07-24 16:12:38 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-09-08 20:08:51 +00:00
|
|
|
static int btrfs_insert_inode_locked(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args args;
|
|
|
|
args.location = &BTRFS_I(inode)->location;
|
|
|
|
args.root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
return insert_inode_locked4(inode,
|
|
|
|
btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
|
|
|
|
btrfs_find_actor, &args);
|
|
|
|
}
|
|
|
|
|
2017-07-18 09:37:05 +00:00
|
|
|
/*
|
|
|
|
* Inherit flags from the parent inode.
|
|
|
|
*
|
|
|
|
* Currently only the compression flags and the cow flags are inherited.
|
|
|
|
*/
|
|
|
|
static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
|
|
|
|
{
|
|
|
|
unsigned int flags;
|
|
|
|
|
|
|
|
if (!dir)
|
|
|
|
return;
|
|
|
|
|
|
|
|
flags = BTRFS_I(dir)->flags;
|
|
|
|
|
|
|
|
if (flags & BTRFS_INODE_NOCOMPRESS) {
|
|
|
|
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
|
|
|
|
} else if (flags & BTRFS_INODE_COMPRESS) {
|
|
|
|
BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (flags & BTRFS_INODE_NODATACOW) {
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
|
|
|
|
if (S_ISREG(inode->i_mode))
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
|
|
|
|
}
|
|
|
|
|
2018-03-26 16:40:21 +00:00
|
|
|
btrfs_sync_inode_flags_to_i_flags(inode);
|
2017-07-18 09:37:05 +00:00
|
|
|
}
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
2008-07-24 16:12:38 +00:00
|
|
|
struct inode *dir,
|
2008-01-29 20:15:18 +00:00
|
|
|
const char *name, int name_len,
|
2011-07-26 07:30:54 +00:00
|
|
|
u64 ref_objectid, u64 objectid,
|
|
|
|
umode_t mode, u64 *index)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct inode *inode;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct btrfs_inode_item *inode_item;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_key *location;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct btrfs_path *path;
|
2008-01-29 20:15:18 +00:00
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
struct btrfs_key key[2];
|
|
|
|
u32 sizes[2];
|
2014-04-27 19:40:45 +00:00
|
|
|
int nitems = name ? 2 : 1;
|
2008-01-29 20:15:18 +00:00
|
|
|
unsigned long ptr;
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret;
|
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 17:38:47 +00:00
|
|
|
if (!path)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
inode = new_inode(fs_info->sb);
|
2011-04-09 02:30:07 +00:00
|
|
|
if (!inode) {
|
|
|
|
btrfs_free_path(path);
|
2007-06-12 10:35:45 +00:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2011-04-09 02:30:07 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2014-07-31 23:10:32 +00:00
|
|
|
/*
|
|
|
|
* O_TMPFILE, set link count to 0, so that after this point,
|
|
|
|
* we fill in an inode item with the correct link count.
|
|
|
|
*/
|
|
|
|
if (!name)
|
|
|
|
set_nlink(inode, 0);
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep creating/deleting files on 32-bit machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
/*
|
|
|
|
* we have to initialize this early, so we can reclaim the inode
|
|
|
|
* number if we fail afterwards in this function.
|
|
|
|
*/
|
|
|
|
inode->i_ino = objectid;
|
|
|
|
|
2014-04-27 19:40:45 +00:00
|
|
|
if (dir && name) {
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful for tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
trace_btrfs_inode_request(dir);
|
|
|
|
|
2017-02-20 11:50:33 +00:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(dir), index);
|
2009-04-02 20:46:06 +00:00
|
|
|
if (ret) {
|
2011-04-09 02:30:07 +00:00
|
|
|
btrfs_free_path(path);
|
2009-04-02 20:46:06 +00:00
|
|
|
iput(inode);
|
2008-07-24 16:12:38 +00:00
|
|
|
return ERR_PTR(ret);
|
2009-04-02 20:46:06 +00:00
|
|
|
}
|
2014-04-27 19:40:45 +00:00
|
|
|
} else if (dir) {
|
|
|
|
*index = 0;
|
2008-07-24 16:12:38 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* index_cnt is ignored for everything but a dir,
|
2018-01-12 03:08:02 +00:00
|
|
|
* btrfs_set_inode_index_count has an explanation for the magic
|
2008-07-24 16:12:38 +00:00
|
|
|
* number
|
|
|
|
*/
|
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
2013-12-26 05:07:06 +00:00
|
|
|
BTRFS_I(inode)->dir_index = *index;
|
2007-06-12 10:35:45 +00:00
|
|
|
BTRFS_I(inode)->root = root;
|
2008-09-05 20:13:11 +00:00
|
|
|
BTRFS_I(inode)->generation = trans->transid;
|
2010-11-19 02:18:02 +00:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
2007-08-27 20:49:44 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
/*
|
|
|
|
* We could have gotten an inode number from somebody who was fsynced
|
|
|
|
* and then removed in this same transaction, so let's just set full
|
|
|
|
* sync since it will be a full sync anyway and this will blow away the
|
|
|
|
* old info in the log.
|
|
|
|
*/
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
|
|
|
|
|
2008-01-29 20:15:18 +00:00
|
|
|
key[0].objectid = objectid;
|
2014-06-04 16:41:45 +00:00
|
|
|
key[0].type = BTRFS_INODE_ITEM_KEY;
|
2008-01-29 20:15:18 +00:00
|
|
|
key[0].offset = 0;
|
|
|
|
|
|
|
|
sizes[0] = sizeof(struct btrfs_inode_item);
|
2014-04-27 19:40:45 +00:00
|
|
|
|
|
|
|
if (name) {
|
|
|
|
/*
|
|
|
|
* Start new inodes with an inode_ref. This is slightly more
|
|
|
|
* efficient for small numbers of hard links since they will
|
|
|
|
* be packed into one item. Extended refs will kick in if we
|
|
|
|
* add more hard links than can fit in the ref item.
|
|
|
|
*/
|
|
|
|
key[1].objectid = objectid;
|
2014-06-04 16:41:45 +00:00
|
|
|
key[1].type = BTRFS_INODE_REF_KEY;
|
2014-04-27 19:40:45 +00:00
|
|
|
key[1].offset = ref_objectid;
|
|
|
|
|
|
|
|
sizes[1] = name_len + sizeof(*ref);
|
|
|
|
}
|
2008-01-29 20:15:18 +00:00
|
|
|
|
2014-09-08 20:08:51 +00:00
|
|
|
location = &BTRFS_I(inode)->location;
|
|
|
|
location->objectid = objectid;
|
|
|
|
location->offset = 0;
|
2014-06-04 16:41:45 +00:00
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
2014-09-08 20:08:51 +00:00
|
|
|
|
|
|
|
ret = btrfs_insert_inode_locked(inode);
|
|
|
|
if (ret < 0)
|
|
|
|
goto fail;
|
|
|
|
|
2009-03-13 15:00:37 +00:00
|
|
|
path->leave_spinning = 1;
|
2014-04-27 19:40:45 +00:00
|
|
|
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
|
2008-01-29 20:15:18 +00:00
|
|
|
if (ret != 0)
|
2014-09-08 20:08:51 +00:00
|
|
|
goto fail_unlock;
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2010-03-04 14:31:47 +00:00
|
|
|
inode_init_owner(inode, dir, mode);
|
2008-10-09 15:46:29 +00:00
|
|
|
inode_set_bytes(inode, 0);
|
2012-07-04 07:18:07 +00:00
|
|
|
|
2016-09-14 14:48:06 +00:00
|
|
|
inode->i_mtime = current_time(inode);
|
2012-07-04 07:18:07 +00:00
|
|
|
inode->i_atime = inode->i_mtime;
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
2018-06-21 16:04:06 +00:00
|
|
|
BTRFS_I(inode)->i_otime = inode->i_mtime;
|
2012-07-04 07:18:07 +00:00
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
2016-11-08 17:09:03 +00:00
|
|
|
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
|
2012-07-10 06:58:58 +00:00
|
|
|
sizeof(*inode_item));
|
2008-09-05 20:13:11 +00:00
|
|
|
fill_inode_item(trans, path->nodes[0], inode_item, inode);
|
2008-01-29 20:15:18 +00:00
|
|
|
|
2014-04-27 19:40:45 +00:00
|
|
|
if (name) {
|
|
|
|
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
|
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
|
|
|
|
ptr = (unsigned long)(ref + 1);
|
|
|
|
write_extent_buffer(path->nodes[0], name, ptr, name_len);
|
|
|
|
}
|
2008-01-29 20:15:18 +00:00
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2009-04-17 08:37:41 +00:00
|
|
|
btrfs_inherit_iflags(inode, dir);
|
|
|
|
|
2011-07-24 21:08:40 +00:00
|
|
|
if (S_ISREG(mode)) {
|
2016-06-22 22:54:23 +00:00
|
|
|
if (btrfs_test_opt(fs_info, NODATASUM))
|
2009-07-02 16:26:06 +00:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
|
2016-06-22 22:54:23 +00:00
|
|
|
if (btrfs_test_opt(fs_info, NODATACOW))
|
2013-02-21 20:28:28 +00:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
|
|
|
|
BTRFS_INODE_NODATASUM;
|
2009-07-02 16:26:06 +00:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
inode_tree_add(inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
|
|
|
trace_btrfs_inode_new(inode);
|
2011-06-24 17:13:29 +00:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2012-07-25 15:35:53 +00:00
|
|
|
btrfs_update_root_times(trans, root);
|
|
|
|
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
ret = btrfs_inode_inherit_props(trans, inode, dir);
|
|
|
|
if (ret)
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_err(fs_info,
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
"error inheriting props for ino %llu (root %llu): %d",
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
return inode;
|
2014-09-08 20:08:51 +00:00
|
|
|
|
|
|
|
fail_unlock:
|
|
|
|
unlock_new_inode(inode);
|
2007-10-15 20:14:19 +00:00
|
|
|
fail:
|
2014-04-27 19:40:45 +00:00
|
|
|
if (dir && name)
|
2008-07-24 16:12:38 +00:00
|
|
|
BTRFS_I(dir)->index_cnt--;
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_free_path(path);
|
2009-04-02 20:46:06 +00:00
|
|
|
iput(inode);
|
2007-10-15 20:14:19 +00:00
|
|
|
return ERR_PTR(ret);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline u8 btrfs_inode_type(struct inode *inode)
|
|
|
|
{
|
|
|
|
return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* utility function to add 'inode' into 'parent_inode' with
|
|
|
|
* a give name and a given sequence number.
|
|
|
|
* if 'add_backref' is true, also insert a backref from the
|
|
|
|
* inode to the parent directory.
|
|
|
|
*/
|
2008-09-05 20:13:11 +00:00
|
|
|
int btrfs_add_link(struct btrfs_trans_handle *trans,
|
2017-02-20 11:51:08 +00:00
|
|
|
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
|
2008-09-05 20:13:11 +00:00
|
|
|
const char *name, int name_len, int add_backref, u64 index)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2009-09-21 19:56:00 +00:00
|
|
|
int ret = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_key key;
|
2017-02-20 11:51:08 +00:00
|
|
|
struct btrfs_root *root = parent_inode->root;
|
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 parent_ino = btrfs_ino(parent_inode);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2017-02-20 11:51:08 +00:00
|
|
|
memcpy(&key, &inode->root->root_key, sizeof(key));
|
2009-09-21 19:56:00 +00:00
|
|
|
} else {
|
2011-04-20 02:31:50 +00:00
|
|
|
key.objectid = ino;
|
2014-06-04 16:41:45 +00:00
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
2009-09-21 19:56:00 +00:00
|
|
|
key.offset = 0;
|
|
|
|
}
|
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2018-08-01 03:32:29 +00:00
|
|
|
ret = btrfs_add_root_ref(trans, key.objectid,
|
2016-06-22 22:54:23 +00:00
|
|
|
root->root_key.objectid, parent_ino,
|
|
|
|
index, name, name_len);
|
2009-09-21 19:56:00 +00:00
|
|
|
} else if (add_backref) {
|
2011-04-20 02:31:50 +00:00
|
|
|
ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
parent_ino, index);
|
2009-09-21 19:56:00 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2012-03-12 15:03:00 +00:00
|
|
|
/* Nothing to clean up yet */
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2009-09-21 19:56:00 +00:00
|
|
|
|
2012-03-12 15:03:00 +00:00
|
|
|
ret = btrfs_insert_dir_item(trans, root, name, name_len,
|
|
|
|
parent_inode, &key,
|
2017-02-20 11:51:08 +00:00
|
|
|
btrfs_inode_type(&inode->vfs_inode), index);
|
2012-12-17 19:26:57 +00:00
|
|
|
if (ret == -EEXIST || ret == -EOVERFLOW)
|
2012-03-12 15:03:00 +00:00
|
|
|
goto fail_dir_item;
|
|
|
|
else if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
return ret;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2012-03-12 15:03:00 +00:00
|
|
|
|
2017-02-20 11:51:08 +00:00
|
|
|
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
|
2012-03-12 15:03:00 +00:00
|
|
|
name_len * 2);
|
2017-02-20 11:51:08 +00:00
|
|
|
inode_inc_iversion(&parent_inode->vfs_inode);
|
|
|
|
parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
|
|
|
|
current_time(&parent_inode->vfs_inode);
|
|
|
|
ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret)
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2007-06-12 10:35:45 +00:00
|
|
|
return ret;
|
2012-02-20 13:40:56 +00:00
|
|
|
|
|
|
|
fail_dir_item:
|
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
|
|
|
u64 local_index;
|
|
|
|
int err;
|
2018-08-01 03:32:28 +00:00
|
|
|
err = btrfs_del_root_ref(trans, key.objectid,
|
2016-06-22 22:54:23 +00:00
|
|
|
root->root_key.objectid, parent_ino,
|
|
|
|
&local_index, name, name_len);
|
2012-02-20 13:40:56 +00:00
|
|
|
|
|
|
|
} else if (add_backref) {
|
|
|
|
u64 local_index;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = btrfs_del_inode_ref(trans, root, name, name_len,
|
|
|
|
ino, parent_ino, &local_index);
|
|
|
|
}
|
|
|
|
return ret;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Add a directory entry for @inode in @dir, using the name stored in @dentry.
 *
 * This simply forwards to btrfs_add_link() with the dentry's name and length.
 * A positive return value from btrfs_add_link() is reported as -EEXIST; zero
 * and negative values are returned unchanged.
 */
static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *dir, struct dentry *dentry,
			    struct btrfs_inode *inode, int backref, u64 index)
{
	int ret;

	ret = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
			     dentry->d_name.len, backref, index);

	return ret > 0 ? -EEXIST : ret;
}
|
|
|
|
|
2007-07-11 14:18:17 +00:00
|
|
|
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
|
2011-07-26 05:52:52 +00:00
|
|
|
umode_t mode, dev_t rdev)
|
2007-07-11 14:18:17 +00:00
|
|
|
{
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2007-07-11 14:18:17 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-12-21 21:27:21 +00:00
|
|
|
struct inode *inode = NULL;
|
2007-07-11 14:18:17 +00:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
u64 objectid;
|
2008-08-05 15:18:09 +00:00
|
|
|
u64 index = 0;
|
2007-07-11 14:18:17 +00:00
|
|
|
|
2009-09-11 20:12:44 +00:00
|
|
|
/*
|
|
|
|
* 2 for inode item and ref
|
|
|
|
* 2 for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 14:48:46 +00:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-12-21 21:27:21 +00:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-24 16:12:38 +00:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2017-01-20 13:54:07 +00:00
|
|
|
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
|
|
|
|
mode, &index);
|
2011-04-25 23:43:53 +00:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-07-11 14:18:17 +00:00
|
|
|
goto out_unlock;
|
2011-04-25 23:43:53 +00:00
|
|
|
}
|
2007-07-11 14:18:17 +00:00
|
|
|
|
2011-12-15 15:09:07 +00:00
|
|
|
/*
|
|
|
|
* If the active LSM wants to access the inode during
|
|
|
|
* d_instantiate it needs these. Smack checks to see
|
|
|
|
* if the filesystem supports xattrs by looking at the
|
|
|
|
* ops vector.
|
|
|
|
*/
|
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
2014-09-08 20:08:51 +00:00
|
|
|
init_special_inode(inode, inode->i_mode, rdev);
|
|
|
|
|
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
2007-07-11 14:18:17 +00:00
|
|
|
if (err)
|
2014-09-08 20:08:51 +00:00
|
|
|
goto out_unlock_inode;
|
|
|
|
|
2017-02-20 11:51:09 +00:00
|
|
|
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
|
|
|
|
0, index);
|
2014-09-08 20:08:51 +00:00
|
|
|
if (err) {
|
|
|
|
goto out_unlock_inode;
|
|
|
|
} else {
|
2007-08-29 13:11:44 +00:00
|
|
|
btrfs_update_inode(trans, root, inode);
|
2018-05-04 12:23:01 +00:00
|
|
|
d_instantiate_new(dentry, inode);
|
2007-07-11 14:18:17 +00:00
|
|
|
}
|
2014-09-08 20:08:51 +00:00
|
|
|
|
2007-07-11 14:18:17 +00:00
|
|
|
out_unlock:
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2007-07-11 14:18:17 +00:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
|
|
|
return err;
|
2014-09-08 20:08:51 +00:00
|
|
|
|
|
|
|
out_unlock_inode:
|
|
|
|
drop_inode = 1;
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_unlock;
|
|
|
|
|
2007-07-11 14:18:17 +00:00
|
|
|
}
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
static int btrfs_create(struct inode *dir, struct dentry *dentry,
|
2012-06-10 22:05:36 +00:00
|
|
|
umode_t mode, bool excl)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-12-21 21:27:21 +00:00
|
|
|
struct inode *inode = NULL;
|
2012-11-30 03:40:09 +00:00
|
|
|
int drop_inode_on_err = 0;
|
2010-05-16 14:48:46 +00:00
|
|
|
int err;
|
2007-06-12 10:35:45 +00:00
|
|
|
u64 objectid;
|
2008-08-05 15:18:09 +00:00
|
|
|
u64 index = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-09-11 20:12:44 +00:00
|
|
|
/*
|
|
|
|
* 2 for inode item and ref
|
|
|
|
* 2 for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 14:48:46 +00:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2009-09-11 20:12:44 +00:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-24 16:12:38 +00:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2017-01-20 13:54:07 +00:00
|
|
|
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
|
|
|
|
mode, &index);
|
2011-04-25 23:43:53 +00:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
goto out_unlock;
|
2011-04-25 23:43:53 +00:00
|
|
|
}
|
2012-11-30 03:40:09 +00:00
|
|
|
drop_inode_on_err = 1;
|
2011-12-15 15:09:07 +00:00
|
|
|
/*
|
|
|
|
* If the active LSM wants to access the inode during
|
|
|
|
* d_instantiate it needs these. Smack checks to see
|
|
|
|
* if the filesystem supports xattrs by looking at the
|
|
|
|
* ops vector.
|
|
|
|
*/
|
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
2014-09-08 20:08:51 +00:00
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
|
|
|
|
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock_inode;
|
|
|
|
|
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock_inode;
|
2011-12-15 15:09:07 +00:00
|
|
|
|
2017-02-20 11:51:09 +00:00
|
|
|
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
|
|
|
|
0, index);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (err)
|
2014-09-08 20:08:51 +00:00
|
|
|
goto out_unlock_inode;
|
2012-11-30 03:40:09 +00:00
|
|
|
|
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2018-05-04 12:23:01 +00:00
|
|
|
d_instantiate_new(dentry, inode);
|
2012-11-30 03:40:09 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
out_unlock:
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2012-11-30 03:40:09 +00:00
|
|
|
if (err && drop_inode_on_err) {
|
2007-06-12 10:35:45 +00:00
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2007-06-12 10:35:45 +00:00
|
|
|
return err;
|
2014-09-08 20:08:51 +00:00
|
|
|
|
|
|
|
out_unlock_inode:
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_unlock;
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
|
|
|
|
struct dentry *dentry)
|
|
|
|
{
|
2016-01-05 16:24:05 +00:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(old_dentry);
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-08-05 15:18:09 +00:00
|
|
|
u64 index;
|
2007-06-12 10:35:45 +00:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
|
2009-11-12 07:14:26 +00:00
|
|
|
/* do not allow sys_link's with other subvols of the same device */
|
|
|
|
if (root->objectid != BTRFS_I(inode)->root->objectid)
|
2011-03-22 17:20:26 +00:00
|
|
|
return -EXDEV;
|
2009-11-12 07:14:26 +00:00
|
|
|
|
2012-08-08 18:32:27 +00:00
|
|
|
if (inode->i_nlink >= BTRFS_LINK_MAX)
|
2011-03-04 17:15:18 +00:00
|
|
|
return -EMLINK;
|
2009-11-12 07:14:26 +00:00
|
|
|
|
2017-02-20 11:50:33 +00:00
|
|
|
err = btrfs_set_inode_index(BTRFS_I(dir), &index);
|
2008-07-24 16:12:38 +00:00
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
|
2010-05-16 14:48:46 +00:00
|
|
|
/*
|
2011-02-18 09:21:17 +00:00
|
|
|
* 2 items for inode and inode ref
|
2010-05-16 14:48:46 +00:00
|
|
|
* 2 items for dir items
|
2011-02-18 09:21:17 +00:00
|
|
|
* 1 item for parent inode
|
2018-05-11 20:13:40 +00:00
|
|
|
* 1 item for orphan item deletion if O_TMPFILE
|
2010-05-16 14:48:46 +00:00
|
|
|
*/
|
2018-05-11 20:13:40 +00:00
|
|
|
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
|
2010-05-16 14:48:46 +00:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
2016-01-05 16:24:05 +00:00
|
|
|
trans = NULL;
|
2010-05-16 14:48:46 +00:00
|
|
|
goto fail;
|
|
|
|
}
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2013-12-26 05:07:06 +00:00
|
|
|
/* There are several dir indexes for this inode, clear the cache. */
|
|
|
|
BTRFS_I(inode)->dir_index = 0ULL;
|
2013-10-16 19:10:34 +00:00
|
|
|
inc_nlink(inode);
|
2012-04-05 19:03:02 +00:00
|
|
|
inode_inc_iversion(inode);
|
2016-09-14 14:48:06 +00:00
|
|
|
inode->i_ctime = current_time(inode);
|
2010-10-23 15:11:40 +00:00
|
|
|
ihold(inode);
|
2012-10-11 19:53:56 +00:00
|
|
|
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
|
2008-07-24 16:12:38 +00:00
|
|
|
|
2017-02-20 11:51:09 +00:00
|
|
|
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
|
|
|
|
1, index);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2009-09-24 13:17:31 +00:00
|
|
|
if (err) {
|
2007-06-22 18:16:25 +00:00
|
|
|
drop_inode = 1;
|
2009-09-24 13:17:31 +00:00
|
|
|
} else {
|
2011-07-17 03:09:10 +00:00
|
|
|
struct dentry *parent = dentry->d_parent;
|
2009-09-24 13:17:31 +00:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (err)
|
|
|
|
goto fail;
|
2014-04-27 19:40:45 +00:00
|
|
|
if (inode->i_nlink == 1) {
|
|
|
|
/*
|
|
|
|
* If new hard link count is 1, it's a file created
|
|
|
|
* with open(2) O_TMPFILE flag.
|
|
|
|
*/
|
2017-02-20 11:50:58 +00:00
|
|
|
err = btrfs_orphan_del(trans, BTRFS_I(inode));
|
2014-04-27 19:40:45 +00:00
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
}
|
2011-12-23 12:58:13 +00:00
|
|
|
d_instantiate(dentry, inode);
|
2017-01-17 22:31:31 +00:00
|
|
|
btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
|
2009-09-24 13:17:31 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2007-12-21 21:27:21 +00:00
|
|
|
fail:
|
2016-01-05 16:24:05 +00:00
|
|
|
if (trans)
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2007-06-12 10:35:45 +00:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2011-07-26 05:41:39 +00:00
|
|
|
/*
 * Create a directory named by @dentry inside @dir.
 *
 * Within one transaction this allocates an inode number and a new inode with
 * S_IFDIR set, installs the directory operation vectors, initialises the
 * security xattrs, sets the size to 0, writes the inode back and links it
 * into the parent.
 *
 * Returns 0 on success or a negative errno. When a step fails after the
 * inode was allocated, the inode is unlocked, its link count is dropped and
 * the reference is released.
 */
static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct inode *inode = NULL;
	bool drop_inode = false;
	u64 objectid = 0;
	u64 index = 0;
	int err = 0;

	/*
	 * 2 items for inode and ref
	 * 2 items for dir items
	 * 1 for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_end_trans;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
				objectid, S_IFDIR | mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_end_trans;
	}

	/* these must be set before we unlock the inode */
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_drop_inode;

	btrfs_i_size_write(BTRFS_I(inode), 0);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		goto out_drop_inode;

	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     dentry->d_name.name, dentry->d_name.len,
			     0, index);
	if (err)
		goto out_drop_inode;

	d_instantiate_new(dentry, inode);
	goto out_end_trans;

out_drop_inode:
	/* Failure with a live new inode: unlock it now, drop it below. */
	unlock_new_inode(inode);
	drop_inode = true;
out_end_trans:
	btrfs_end_transaction(trans);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
static noinline int uncompress_inline(struct btrfs_path *path,
|
2015-05-19 14:46:45 +00:00
|
|
|
struct page *page,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
size_t pg_offset, u64 extent_offset,
|
|
|
|
struct btrfs_file_extent_item *item)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
char *tmp;
|
|
|
|
size_t max_size;
|
|
|
|
unsigned long inline_size;
|
|
|
|
unsigned long ptr;
|
2010-12-17 06:21:50 +00:00
|
|
|
int compress_type;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
|
|
|
|
WARN_ON(pg_offset != 0);
|
2010-12-17 06:21:50 +00:00
|
|
|
compress_type = btrfs_file_extent_compression(leaf, item);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
max_size = btrfs_file_extent_ram_bytes(leaf, item);
|
|
|
|
inline_size = btrfs_file_extent_inline_item_len(leaf,
|
2013-09-16 14:58:09 +00:00
|
|
|
btrfs_item_nr(path->slots[0]));
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
tmp = kmalloc(inline_size, GFP_NOFS);
|
2011-04-25 23:43:52 +00:00
|
|
|
if (!tmp)
|
|
|
|
return -ENOMEM;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
ptr = btrfs_file_extent_inline_start(item);
|
|
|
|
|
|
|
|
read_extent_buffer(leaf, tmp, ptr, inline_size);
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
|
2010-12-17 06:21:50 +00:00
|
|
|
ret = btrfs_decompress(compress_type, tmp, page,
|
|
|
|
extent_offset, inline_size, max_size);
|
btrfs: add missing memset while reading compressed inline extents
This is a story about 4 distinct (and very old) btrfs bugs.
Commit c8b978188c ("Btrfs: Add zlib compression support") added
three data corruption bugs for inline extents (bugs #1-3).
Commit 93c82d5750 ("Btrfs: zero page past end of inline file items")
fixed bug #1: uncompressed inline extents followed by a hole and more
extents could get non-zero data in the hole as they were read. The fix
was to add a memset in btrfs_get_extent to zero out the hole.
Commit 166ae5a418 ("btrfs: fix inline compressed read err corruption")
fixed bug #2: compressed inline extents which contained non-zero bytes
might be replaced with zero bytes in some cases. This patch removed an
unhelpful memset from uncompress_inline, but the case where memset is
required was missed.
There is also a memset in the decompression code, but this only covers
decompressed data that is shorter than the ram_bytes from the extent
ref record. This memset doesn't cover the region between the end of the
decompressed data and the end of the page. It has also moved around a
few times over the years, so there's no single patch to refer to.
This patch fixes bug #3: compressed inline extents followed by a hole
and more extents could get non-zero data in the hole as they were read
(i.e. bug #3 is the same as bug #1, but s/uncompressed/compressed/).
The fix is the same: zero out the hole in the compressed case too,
by putting a memset back in uncompress_inline, but this time with
correct parameters.
The last and oldest bug, bug #0, is the cause of the offending inline
extent/hole/extent pattern. Bug #0 is a subtle and mostly-harmless quirk
of behavior somewhere in the btrfs write code. In a few special cases,
an inline extent and hole are allowed to persist where they normally
would be combined with later extents in the file.
A fast reproducer for bug #0 is presented below. A few offending extents
are also created in the wild during large rsync transfers with the -S
flag. A Linux kernel build (git checkout; make allyesconfig; make -j8)
will produce a handful of offending files as well. Once an offending
file is created, it can present different content to userspace each
time it is read.
Bug #0 is at least 4 and possibly 8 years old. I verified every vX.Y
kernel back to v3.5 has this behavior. There are fossil records of this
bug's effects in commits all the way back to v2.6.32. I have no reason
to believe bug #0 wasn't present at the beginning of btrfs compression
support in v2.6.29, but I can't easily test kernels that old to be sure.
It is not clear whether bug #0 is worth fixing. A fix would likely
require injecting extra reads into currently write-only paths, and most
of the exceptional cases caused by bug #0 are already handled now.
Whether we like them or not, bug #0's inline extents followed by holes
are part of the btrfs de-facto disk format now, and we need to be able
to read them without data corruption or an infoleak. So enough about
bug #0, let's get back to bug #3 (this patch).
An example of on-disk structure leading to data corruption found in
the wild:
item 61 key (606890 INODE_ITEM 0) itemoff 9662 itemsize 160
inode generation 50 transid 50 size 47424 nbytes 49141
block group 0 mode 100644 links 1 uid 0 gid 0
rdev 0 flags 0x0(none)
item 62 key (606890 INODE_REF 603050) itemoff 9642 itemsize 20
inode ref index 3 namelen 10 name: DB_File.so
item 63 key (606890 EXTENT_DATA 0) itemoff 8280 itemsize 1362
inline extent data size 1341 ram 4085 compress(zlib)
item 64 key (606890 EXTENT_DATA 4096) itemoff 8227 itemsize 53
extent data disk byte 5367308288 nr 20480
extent data offset 0 nr 45056 ram 45056
extent compression(zlib)
Different data appears in userspace during each read of the 11 bytes
between 4085 and 4096. The extent in item 63 is not long enough to
fill the first page of the file, so a memset is required to fill the
space between item 63 (ending at 4085) and item 64 (beginning at 4096)
with zero.
Here is a reproducer from Liu Bo, which demonstrates another method
of creating the same inline extent and hole pattern:
Using 'page_poison=on' kernel command line (or enable
CONFIG_PAGE_POISONING) run the following:
# touch foo
# chattr +c foo
# xfs_io -f -c "pwrite -W 0 1000" foo
# xfs_io -f -c "falloc 4 8188" foo
# od -x foo
# echo 3 >/proc/sys/vm/drop_caches
# od -x foo
This produces the following on my box:
Correct output: file contains 1000 data bytes followed
by zeros:
0000000 cdcd cdcd cdcd cdcd cdcd cdcd cdcd cdcd
*
0001740 cdcd cdcd cdcd cdcd 0000 0000 0000 0000
0001760 0000 0000 0000 0000 0000 0000 0000 0000
*
0020000
Actual output: the data after the first 1000 bytes
will be different each run:
0000000 cdcd cdcd cdcd cdcd cdcd cdcd cdcd cdcd
*
0001740 cdcd cdcd cdcd cdcd 6c63 7400 635f 006d
0001760 5f74 6f43 7400 435f 0053 5f74 7363 7400
0002000 435f 0056 5f74 6164 7400 645f 0062 5f74
(...)
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Chris Mason <clm@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2017-03-10 21:45:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* decompression code contains a memset to fill in any space between the end
|
|
|
|
* of the uncompressed data and the end of max_size in case the decompressed
|
|
|
|
* data ends up shorter than ram_bytes. That doesn't cover the hole between
|
|
|
|
* the end of an inline extent and the beginning of the next block, so we
|
|
|
|
* cover that region here.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (max_size + pg_offset < PAGE_SIZE) {
|
|
|
|
char *map = kmap(page);
|
|
|
|
memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
kfree(tmp);
|
2014-05-09 21:15:10 +00:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* a bit scary, this does extent mapping from logical file offset to the disk.
|
2009-01-06 02:25:51 +00:00
|
|
|
* the ugly parts come from merging extents from the disk with the in-ram
|
|
|
|
* representation. This gets more complex because of the data=ordered code,
|
2008-09-29 19:18:18 +00:00
|
|
|
* where the in-ram extents might be locked pending data=ordered completion.
|
|
|
|
*
|
|
|
|
* This also copies inline extents directly into the page.
|
|
|
|
*/
|
2017-02-20 11:51:06 +00:00
|
|
|
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
|
|
|
|
struct page *page,
|
|
|
|
size_t pg_offset, u64 start, u64 len,
|
|
|
|
int create)
|
2007-08-27 20:49:44 +00:00
|
|
|
{
|
2018-06-29 08:56:42 +00:00
|
|
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
2007-08-27 20:49:44 +00:00
|
|
|
int ret;
|
|
|
|
int err = 0;
|
|
|
|
u64 extent_start = 0;
|
|
|
|
u64 extent_end = 0;
|
2017-02-20 11:51:06 +00:00
|
|
|
u64 objectid = btrfs_ino(inode);
|
2007-08-27 20:49:44 +00:00
|
|
|
u32 found_type;
|
2008-07-22 15:18:09 +00:00
|
|
|
struct btrfs_path *path = NULL;
|
2017-02-20 11:51:06 +00:00
|
|
|
struct btrfs_root *root = inode->root;
|
2007-08-27 20:49:44 +00:00
|
|
|
struct btrfs_file_extent_item *item;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key found_key;
|
2007-08-27 20:49:44 +00:00
|
|
|
struct extent_map *em = NULL;
|
2017-02-20 11:51:06 +00:00
|
|
|
struct extent_map_tree *em_tree = &inode->extent_tree;
|
|
|
|
struct extent_io_tree *io_tree = &inode->io_tree;
|
2014-06-09 02:48:05 +00:00
|
|
|
const bool new_inline = !page || create;
|
2007-08-27 20:49:44 +00:00
|
|
|
|
2009-09-02 20:24:52 +00:00
|
|
|
read_lock(&em_tree->lock);
|
2008-01-24 21:13:08 +00:00
|
|
|
em = lookup_extent_mapping(em_tree, start, len);
|
2008-05-07 15:43:44 +00:00
|
|
|
if (em)
|
2016-06-22 22:54:23 +00:00
|
|
|
em->bdev = fs_info->fs_devices->latest_bdev;
|
2009-09-02 20:24:52 +00:00
|
|
|
read_unlock(&em_tree->lock);
|
2008-01-24 21:13:08 +00:00
|
|
|
|
2007-08-27 20:49:44 +00:00
|
|
|
if (em) {
|
2008-04-22 17:26:46 +00:00
|
|
|
if (em->start > start || em->start + em->len <= start)
|
|
|
|
free_extent_map(em);
|
|
|
|
else if (em->block_start == EXTENT_MAP_INLINE && page)
|
2008-01-29 14:59:12 +00:00
|
|
|
free_extent_map(em);
|
|
|
|
else
|
|
|
|
goto out;
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
2011-04-20 22:48:27 +00:00
|
|
|
em = alloc_extent_map();
|
2007-08-27 20:49:44 +00:00
|
|
|
if (!em) {
|
2008-01-24 21:13:08 +00:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
2016-06-22 22:54:23 +00:00
|
|
|
em->bdev = fs_info->fs_devices->latest_bdev;
|
2008-01-24 21:13:08 +00:00
|
|
|
em->start = EXTENT_MAP_HOLE;
|
2008-11-10 16:53:33 +00:00
|
|
|
em->orig_start = EXTENT_MAP_HOLE;
|
2008-01-24 21:13:08 +00:00
|
|
|
em->len = (u64)-1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
em->block_len = (u64)-1;
|
2008-07-22 15:18:09 +00:00
|
|
|
|
|
|
|
if (!path) {
|
|
|
|
path = btrfs_alloc_path();
|
2011-05-13 14:32:11 +00:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Chances are we'll be called again, so go ahead and do
|
|
|
|
* readahead
|
|
|
|
*/
|
2015-11-27 15:31:35 +00:00
|
|
|
path->reada = READA_FORWARD;
|
2008-07-22 15:18:09 +00:00
|
|
|
}
|
|
|
|
|
2017-12-01 09:19:40 +00:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
|
2007-08-27 20:49:44 +00:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret != 0) {
|
|
|
|
if (path->slots[0] == 0)
|
|
|
|
goto not_found;
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0],
|
2007-08-27 20:49:44 +00:00
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
/* are we inside the extent that was found? */
|
2007-10-15 20:14:19 +00:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
2014-06-04 16:41:45 +00:00
|
|
|
found_type = found_key.type;
|
2007-10-15 20:14:19 +00:00
|
|
|
if (found_key.objectid != objectid ||
|
2007-08-27 20:49:44 +00:00
|
|
|
found_type != BTRFS_EXTENT_DATA_KEY) {
|
2013-10-14 16:08:38 +00:00
|
|
|
/*
|
|
|
|
* If we backup past the first extent we want to move forward
|
|
|
|
* and see if there is an extent in front of us, otherwise we'll
|
|
|
|
* say there is a hole for our whole search range which can
|
|
|
|
* cause problems.
|
|
|
|
*/
|
|
|
|
extent_end = start;
|
|
|
|
goto next;
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
|
|
|
|
2007-10-15 20:14:19 +00:00
|
|
|
found_type = btrfs_file_extent_type(leaf, item);
|
|
|
|
extent_start = found_key.offset;
|
2008-10-30 18:25:28 +00:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2007-08-27 20:49:44 +00:00
|
|
|
extent_end = extent_start +
|
2007-10-15 20:15:53 +00:00
|
|
|
btrfs_file_extent_num_bytes(leaf, item);
|
2017-03-10 19:09:48 +00:00
|
|
|
|
|
|
|
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
|
|
|
|
extent_start);
|
2008-10-30 18:19:41 +00:00
|
|
|
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
size_t size;
|
2018-06-06 07:41:49 +00:00
|
|
|
|
|
|
|
size = btrfs_file_extent_ram_bytes(leaf, item);
|
2016-06-15 13:22:56 +00:00
|
|
|
extent_end = ALIGN(extent_start + size,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize);
|
2017-03-10 19:09:48 +00:00
|
|
|
|
|
|
|
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
|
|
|
|
path->slots[0],
|
|
|
|
extent_start);
|
2008-10-30 18:19:41 +00:00
|
|
|
}
|
2013-10-14 16:08:38 +00:00
|
|
|
next:
|
2008-10-30 18:19:41 +00:00
|
|
|
if (start >= extent_end) {
|
|
|
|
path->slots[0]++;
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
2008-10-30 18:19:41 +00:00
|
|
|
if (ret > 0)
|
|
|
|
goto not_found;
|
|
|
|
leaf = path->nodes[0];
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
2008-10-30 18:19:41 +00:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
if (found_key.objectid != objectid ||
|
|
|
|
found_key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
goto not_found;
|
|
|
|
if (start + len <= found_key.offset)
|
|
|
|
goto not_found;
|
2014-07-17 03:44:14 +00:00
|
|
|
if (start > found_key.offset)
|
|
|
|
goto next;
|
2008-10-30 18:19:41 +00:00
|
|
|
em->start = start;
|
2012-10-11 20:54:30 +00:00
|
|
|
em->orig_start = start;
|
2008-10-30 18:19:41 +00:00
|
|
|
em->len = found_key.offset - start;
|
|
|
|
goto not_found_em;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:51:06 +00:00
|
|
|
btrfs_extent_item_to_extent_map(inode, path, item,
|
2017-02-20 11:51:02 +00:00
|
|
|
new_inline, em);
|
2014-06-09 02:48:05 +00:00
|
|
|
|
2008-10-30 18:25:28 +00:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2007-08-27 20:49:44 +00:00
|
|
|
goto insert;
|
|
|
|
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-15 20:14:19 +00:00
|
|
|
unsigned long ptr;
|
2007-08-27 20:49:44 +00:00
|
|
|
char *map;
|
2007-10-15 20:18:25 +00:00
|
|
|
size_t size;
|
|
|
|
size_t extent_offset;
|
|
|
|
size_t copy_size;
|
2007-08-27 20:49:44 +00:00
|
|
|
|
2014-06-09 02:48:05 +00:00
|
|
|
if (new_inline)
|
2007-10-29 15:41:07 +00:00
|
|
|
goto out;
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2018-06-06 07:41:49 +00:00
|
|
|
size = btrfs_file_extent_ram_bytes(leaf, item);
|
2008-10-30 18:19:41 +00:00
|
|
|
extent_offset = page_offset(page) + pg_offset - extent_start;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion whether a
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
copy_size = min_t(u64, PAGE_SIZE - pg_offset,
|
|
|
|
size - extent_offset);
|
2007-10-15 20:18:25 +00:00
|
|
|
em->start = extent_start + extent_offset;
|
2016-06-22 22:54:23 +00:00
|
|
|
em->len = ALIGN(copy_size, fs_info->sectorsize);
|
2012-12-03 15:31:19 +00:00
|
|
|
em->orig_block_len = em->len;
|
2012-10-11 20:54:30 +00:00
|
|
|
em->orig_start = em->start;
|
2007-10-29 15:41:07 +00:00
|
|
|
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
|
2017-11-20 20:24:49 +00:00
|
|
|
if (!PageUptodate(page)) {
|
2010-12-17 06:21:50 +00:00
|
|
|
if (btrfs_file_extent_compression(leaf, item) !=
|
|
|
|
BTRFS_COMPRESS_NONE) {
|
2015-05-19 14:46:45 +00:00
|
|
|
ret = uncompress_inline(path, page, pg_offset,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
extent_offset, item);
|
2014-05-09 21:15:10 +00:00
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
} else {
|
|
|
|
map = kmap(page);
|
|
|
|
read_extent_buffer(leaf, map + pg_offset, ptr,
|
|
|
|
copy_size);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion whether a
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
if (pg_offset + copy_size < PAGE_SIZE) {
|
2009-09-11 16:36:29 +00:00
|
|
|
memset(map + pg_offset + copy_size, 0,
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
PAGE_SIZE - pg_offset -
|
2009-09-11 16:36:29 +00:00
|
|
|
copy_size);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
kunmap(page);
|
|
|
|
}
|
2007-11-01 15:28:41 +00:00
|
|
|
flush_dcache_page(page);
|
2007-08-27 20:49:44 +00:00
|
|
|
}
|
2008-01-24 21:13:08 +00:00
|
|
|
set_extent_uptodate(io_tree, em->start,
|
2011-04-06 10:02:20 +00:00
|
|
|
extent_map_end(em) - 1, NULL, GFP_NOFS);
|
2007-08-27 20:49:44 +00:00
|
|
|
goto insert;
|
|
|
|
}
|
|
|
|
not_found:
|
|
|
|
em->start = start;
|
2012-10-11 20:54:30 +00:00
|
|
|
em->orig_start = start;
|
2008-01-24 21:13:08 +00:00
|
|
|
em->len = len;
|
2007-08-27 20:49:44 +00:00
|
|
|
not_found_em:
|
2007-10-15 20:14:19 +00:00
|
|
|
em->block_start = EXTENT_MAP_HOLE;
|
2007-08-27 20:49:44 +00:00
|
|
|
insert:
|
2011-04-20 23:20:15 +00:00
|
|
|
btrfs_release_path(path);
|
2008-01-24 21:13:08 +00:00
|
|
|
if (em->start > start || extent_map_end(em) <= start) {
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 14:05:00 +00:00
|
|
|
"bad extent! em: [%llu %llu] passed [%llu %llu]",
|
|
|
|
em->start, em->len, start, len);
|
2007-08-27 20:49:44 +00:00
|
|
|
err = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
2008-01-24 21:13:08 +00:00
|
|
|
|
|
|
|
err = 0;
|
2009-09-02 20:24:52 +00:00
|
|
|
write_lock(&em_tree->lock);
|
2018-04-03 19:45:57 +00:00
|
|
|
err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
|
2009-09-02 20:24:52 +00:00
|
|
|
write_unlock(&em_tree->lock);
|
2007-08-27 20:49:44 +00:00
|
|
|
out:
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2017-02-20 11:51:06 +00:00
|
|
|
trace_btrfs_get_extent(root, inode, em);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
|
|
|
|
2015-08-19 05:55:00 +00:00
|
|
|
btrfs_free_path(path);
|
2007-08-27 20:49:44 +00:00
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
2012-03-12 15:03:00 +00:00
|
|
|
BUG_ON(!em); /* Error is always set */
|
2007-08-27 20:49:44 +00:00
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:51:06 +00:00
|
|
|
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
|
|
|
|
struct page *page,
|
|
|
|
size_t pg_offset, u64 start, u64 len,
|
|
|
|
int create)
|
2011-02-23 21:23:20 +00:00
|
|
|
{
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map *hole_em = NULL;
|
|
|
|
u64 range_start = start;
|
|
|
|
u64 end;
|
|
|
|
u64 found;
|
|
|
|
u64 found_end;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
|
|
|
|
if (IS_ERR(em))
|
|
|
|
return em;
|
2017-04-11 08:57:15 +00:00
|
|
|
/*
|
|
|
|
* If our em maps to:
|
|
|
|
* - a hole or
|
|
|
|
* - a pre-alloc extent,
|
|
|
|
* there might actually be delalloc bytes behind it.
|
|
|
|
*/
|
|
|
|
if (em->block_start != EXTENT_MAP_HOLE &&
|
|
|
|
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
|
|
|
|
return em;
|
|
|
|
else
|
|
|
|
hole_em = em;
|
2011-02-23 21:23:20 +00:00
|
|
|
|
|
|
|
/* check to see if we've wrapped (len == -1 or similar) */
|
|
|
|
end = start + len;
|
|
|
|
if (end < start)
|
|
|
|
end = (u64)-1;
|
|
|
|
else
|
|
|
|
end -= 1;
|
|
|
|
|
|
|
|
em = NULL;
|
|
|
|
|
|
|
|
/* ok, we didn't find anything, lets look for delalloc */
|
2017-02-20 11:51:06 +00:00
|
|
|
found = count_range_bits(&inode->io_tree, &range_start,
|
2011-02-23 21:23:20 +00:00
|
|
|
end, len, EXTENT_DELALLOC, 1);
|
|
|
|
found_end = range_start + found;
|
|
|
|
if (found_end < range_start)
|
|
|
|
found_end = (u64)-1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we didn't find anything useful, return
|
|
|
|
* the original results from get_extent()
|
|
|
|
*/
|
|
|
|
if (range_start > end || found_end <= start) {
|
|
|
|
em = hole_em;
|
|
|
|
hole_em = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* adjust the range_start to make sure it doesn't
|
|
|
|
* go backwards from the start they passed in
|
|
|
|
*/
|
2013-10-31 05:03:04 +00:00
|
|
|
range_start = max(start, range_start);
|
2011-02-23 21:23:20 +00:00
|
|
|
found = found_end - range_start;
|
|
|
|
|
|
|
|
if (found > 0) {
|
|
|
|
u64 hole_start = start;
|
|
|
|
u64 hole_len = len;
|
|
|
|
|
2011-04-20 22:48:27 +00:00
|
|
|
em = alloc_extent_map();
|
2011-02-23 21:23:20 +00:00
|
|
|
if (!em) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* when btrfs_get_extent can't find anything it
|
|
|
|
* returns one huge hole
|
|
|
|
*
|
|
|
|
* make sure what it found really fits our range, and
|
|
|
|
* adjust to make sure it is based on the start from
|
|
|
|
* the caller
|
|
|
|
*/
|
|
|
|
if (hole_em) {
|
|
|
|
u64 calc_end = extent_map_end(hole_em);
|
|
|
|
|
|
|
|
if (calc_end <= start || (hole_em->start > end)) {
|
|
|
|
free_extent_map(hole_em);
|
|
|
|
hole_em = NULL;
|
|
|
|
} else {
|
|
|
|
hole_start = max(hole_em->start, start);
|
|
|
|
hole_len = calc_end - hole_start;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
em->bdev = NULL;
|
|
|
|
if (hole_em && range_start > hole_start) {
|
|
|
|
/* our hole starts before our delalloc, so we
|
|
|
|
* have to return just the parts of the hole
|
|
|
|
* that go until the delalloc starts
|
|
|
|
*/
|
|
|
|
em->len = min(hole_len,
|
|
|
|
range_start - hole_start);
|
|
|
|
em->start = hole_start;
|
|
|
|
em->orig_start = hole_start;
|
|
|
|
/*
|
|
|
|
* don't adjust block start at all,
|
|
|
|
* it is fixed at EXTENT_MAP_HOLE
|
|
|
|
*/
|
|
|
|
em->block_start = hole_em->block_start;
|
|
|
|
em->block_len = hole_len;
|
2013-01-07 10:10:12 +00:00
|
|
|
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
|
|
|
|
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
|
2011-02-23 21:23:20 +00:00
|
|
|
} else {
|
|
|
|
em->start = range_start;
|
|
|
|
em->len = found;
|
|
|
|
em->orig_start = range_start;
|
|
|
|
em->block_start = EXTENT_MAP_DELALLOC;
|
|
|
|
em->block_len = found;
|
|
|
|
}
|
2017-12-01 09:19:43 +00:00
|
|
|
} else {
|
2011-02-23 21:23:20 +00:00
|
|
|
return hole_em;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
|
|
|
|
free_extent_map(hole_em);
|
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2016-05-12 12:53:36 +00:00
|
|
|
static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
|
|
|
|
const u64 start,
|
|
|
|
const u64 len,
|
|
|
|
const u64 orig_start,
|
|
|
|
const u64 block_start,
|
|
|
|
const u64 block_len,
|
|
|
|
const u64 orig_block_len,
|
|
|
|
const u64 ram_bytes,
|
|
|
|
const int type)
|
|
|
|
{
|
|
|
|
struct extent_map *em = NULL;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (type != BTRFS_ORDERED_NOCOW) {
|
2017-01-31 15:50:22 +00:00
|
|
|
em = create_io_em(inode, start, len, orig_start,
|
|
|
|
block_start, block_len, orig_block_len,
|
|
|
|
ram_bytes,
|
|
|
|
BTRFS_COMPRESS_NONE, /* compress_type */
|
|
|
|
type);
|
2016-05-12 12:53:36 +00:00
|
|
|
if (IS_ERR(em))
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
|
|
|
|
len, block_len, type);
|
|
|
|
if (ret) {
|
|
|
|
if (em) {
|
|
|
|
free_extent_map(em);
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), start,
|
2016-05-12 12:53:36 +00:00
|
|
|
start + len - 1, 0);
|
|
|
|
}
|
|
|
|
em = ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
|
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2010-05-23 15:00:55 +00:00
|
|
|
/*
 * Reserve a new extent for a direct IO write at file offset @start and build
 * the matching extent map / ordered extent through btrfs_create_dio_extent().
 *
 * Returns the result of btrfs_create_dio_extent(), or an ERR_PTR() carrying
 * the btrfs_reserve_extent() error.  The block group reservation count is
 * dropped in every case once the DIO extent has been attempted, and the
 * reserved extent is released again if creating the DIO extent failed.
 */
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
						  u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *dio_em;
	struct btrfs_key reserved;
	u64 hint;
	u64 bytenr;
	u64 nbytes;
	int err;

	hint = get_extent_allocation_hint(inode, start, len);
	err = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, hint, &reserved, 1, 1);
	if (err)
		return ERR_PTR(err);

	/* The reservation key carries the disk start and length of the extent. */
	bytenr = reserved.objectid;
	nbytes = reserved.offset;

	dio_em = btrfs_create_dio_extent(inode, start, nbytes, start,
					 bytenr, nbytes, nbytes,
					 nbytes, BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, bytenr);
	if (IS_ERR(dio_em))
		btrfs_free_reserved_extent(fs_info, bytenr, nbytes, 1);

	return dio_em;
}
|
|
|
|
|
2010-05-26 15:04:10 +00:00
|
|
|
/*
|
|
|
|
* returns 1 when the nocow is safe, < 1 on error, 0 if the
|
|
|
|
* block must be cow'd
|
|
|
|
*/
|
2013-08-14 18:02:47 +00:00
|
|
|
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
2013-06-21 20:37:03 +00:00
|
|
|
u64 *orig_start, u64 *orig_block_len,
|
|
|
|
u64 *ram_bytes)
|
2010-05-26 15:04:10 +00:00
|
|
|
{
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2010-05-26 15:04:10 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2014-02-27 05:58:05 +00:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-05-26 15:04:10 +00:00
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 backref_offset;
|
|
|
|
u64 extent_end;
|
|
|
|
u64 num_bytes;
|
|
|
|
int slot;
|
|
|
|
int found_type;
|
2013-06-21 20:37:03 +00:00
|
|
|
bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
|
2013-12-27 13:11:50 +00:00
|
|
|
|
2010-05-26 15:04:10 +00:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-01-20 13:54:07 +00:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path,
|
|
|
|
btrfs_ino(BTRFS_I(inode)), offset, 0);
|
2010-05-26 15:04:10 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
slot = path->slots[0];
|
|
|
|
if (ret == 1) {
|
|
|
|
if (slot == 0) {
|
|
|
|
/* can't find the item, must cow */
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
slot--;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
2017-01-10 18:35:31 +00:00
|
|
|
if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
|
2010-05-26 15:04:10 +00:00
|
|
|
key.type != BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
/* not our file or wrong item type, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (key.offset > offset) {
|
|
|
|
/* Wrong offset, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
|
|
|
|
found_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
if (found_type != BTRFS_FILE_EXTENT_REG &&
|
|
|
|
found_type != BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
/* not a regular extent, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
2013-06-21 20:37:03 +00:00
|
|
|
|
|
|
|
if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
|
|
|
|
goto out;
|
|
|
|
|
2013-12-27 13:11:50 +00:00
|
|
|
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
|
|
|
|
if (extent_end <= offset)
|
|
|
|
goto out;
|
|
|
|
|
2010-05-26 15:04:10 +00:00
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
2013-06-21 20:37:03 +00:00
|
|
|
if (disk_bytenr == 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (btrfs_file_extent_compression(leaf, fi) ||
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) ||
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
goto out;
|
|
|
|
|
2018-05-17 06:58:29 +00:00
|
|
|
/*
|
|
|
|
* Do the same check as in btrfs_cross_ref_exist but without the
|
|
|
|
* unnecessary search.
|
|
|
|
*/
|
|
|
|
if (btrfs_file_extent_generation(leaf, fi) <=
|
|
|
|
btrfs_root_last_snapshot(&root->root_item))
|
|
|
|
goto out;
|
|
|
|
|
2010-05-26 15:04:10 +00:00
|
|
|
backref_offset = btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
2013-06-21 20:37:03 +00:00
|
|
|
if (orig_start) {
|
|
|
|
*orig_start = key.offset - backref_offset;
|
|
|
|
*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
|
|
|
|
*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
|
|
|
|
}
|
2013-04-24 20:32:55 +00:00
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
if (btrfs_extent_readonly(fs_info, disk_bytenr))
|
2010-05-26 15:04:10 +00:00
|
|
|
goto out;
|
2014-02-27 05:58:05 +00:00
|
|
|
|
|
|
|
num_bytes = min(offset + *len, extent_end) - offset;
|
|
|
|
if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
u64 range_end;
|
|
|
|
|
2016-06-15 13:22:56 +00:00
|
|
|
range_end = round_up(offset + num_bytes,
|
|
|
|
root->fs_info->sectorsize) - 1;
|
2014-02-27 05:58:05 +00:00
|
|
|
ret = test_range_bit(io_tree, offset, range_end,
|
|
|
|
EXTENT_DELALLOC, 0, NULL);
|
|
|
|
if (ret) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-10-18 16:10:36 +00:00
|
|
|
btrfs_release_path(path);
|
2010-05-26 15:04:10 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* look for other files referencing this extent, if we
|
|
|
|
* find any we must cow
|
|
|
|
*/
|
2013-08-14 18:02:47 +00:00
|
|
|
|
2017-01-30 20:25:28 +00:00
|
|
|
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
|
2013-08-14 18:02:47 +00:00
|
|
|
key.offset - backref_offset, disk_bytenr);
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2010-05-26 15:04:10 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* adjust disk_bytenr and num_bytes to cover just the bytes
|
|
|
|
* in this extent we are about to write. If there
|
|
|
|
* are any csums in that range we have to cow in order
|
|
|
|
* to keep the csums correct
|
|
|
|
*/
|
|
|
|
disk_bytenr += backref_offset;
|
|
|
|
disk_bytenr += offset - key.offset;
|
2016-06-22 22:54:24 +00:00
|
|
|
if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
|
|
|
|
goto out;
|
2010-05-26 15:04:10 +00:00
|
|
|
/*
|
|
|
|
* all of the above have passed, it is safe to overwrite this extent
|
|
|
|
* without cow
|
|
|
|
*/
|
2013-04-24 20:32:55 +00:00
|
|
|
*len = num_bytes;
|
2010-05-26 15:04:10 +00:00
|
|
|
ret = 1;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-07-31 20:28:48 +00:00
|
|
|
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
|
|
|
|
struct extent_state **cached_state, int writing)
|
|
|
|
{
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
2015-12-03 13:30:40 +00:00
|
|
|
cached_state);
|
2012-07-31 20:28:48 +00:00
|
|
|
/*
|
|
|
|
* We're concerned with the entire range that we're going to be
|
2016-05-20 01:18:45 +00:00
|
|
|
* doing DIO to, so we need to make sure there's no ordered
|
2012-07-31 20:28:48 +00:00
|
|
|
* extents in this range.
|
|
|
|
*/
|
2017-02-20 11:50:49 +00:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
|
2012-07-31 20:28:48 +00:00
|
|
|
lockend - lockstart + 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to make sure there are no buffered pages in this
|
|
|
|
* range either, we could have raced between the invalidate in
|
|
|
|
* generic_file_direct_write and locking the extent. The
|
|
|
|
* invalidate needs to happen so that reads after a write do not
|
|
|
|
* get stale data.
|
|
|
|
*/
|
2014-05-20 20:07:56 +00:00
|
|
|
if (!ordered &&
|
2018-03-07 14:33:22 +00:00
|
|
|
(!writing || !filemap_range_has_page(inode->i_mapping,
|
|
|
|
lockstart, lockend)))
|
2012-07-31 20:28:48 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
2017-12-12 20:43:52 +00:00
|
|
|
cached_state);
|
2012-07-31 20:28:48 +00:00
|
|
|
|
|
|
|
if (ordered) {
|
Btrfs: fix deadlock between direct IO reads and buffered writes
While running a test with a mix of buffered IO and direct IO against
the same files I hit a deadlock reported by the following trace:
[11642.140352] INFO: task kworker/u32:3:15282 blocked for more than 120 seconds.
[11642.142452] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.143982] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.146332] kworker/u32:3 D ffff880230ef7988 [11642.147737] systemd-journald[571]: Sent WATCHDOG=1 notification.
[11642.149771] 0 15282 2 0x00000000
[11642.151205] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs]
[11642.154074] ffff880230ef7988 0000000000000246 0000000000014ec0 ffff88023ec94ec0
[11642.156722] ffff880233fe8f80 ffff880230ef8000 ffff88023ec94ec0 7fffffffffffffff
[11642.159205] 0000000000000002 ffffffff8147b7f9 ffff880230ef79a0 ffffffff8147b541
[11642.161403] Call Trace:
[11642.162129] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.163396] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.164871] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.167020] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.167931] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.182320] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.183762] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.185308] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.186782] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.188217] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.189626] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.190803] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.192158] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.193379] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.194831] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.197068] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.199188] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.200723] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.202465] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.203836] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.205624] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.207057] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.208529] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.210375] [<ffffffffa0462613>] ? btrfs_scrubparity_helper+0x140/0x33a [btrfs]
[11642.212132] [<ffffffffa044f974>] btrfs_run_ordered_extent_work+0x25/0x34 [btrfs]
[11642.213837] [<ffffffffa046262f>] btrfs_scrubparity_helper+0x15c/0x33a [btrfs]
[11642.215457] [<ffffffffa046293b>] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
[11642.217095] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.218324] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.219466] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.220801] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.222032] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.223190] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.224394] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.226295] 2 locks held by kworker/u32:3/15282:
[11642.227273] #0: ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.229412] #1: ((&work->normal_work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.231414] INFO: task kworker/u32:8:15289 blocked for more than 120 seconds.
[11642.232872] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.234109] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.235776] kworker/u32:8 D ffff88020de5f848 0 15289 2 0x00000000
[11642.237412] Workqueue: writeback wb_workfn (flush-btrfs-481)
[11642.238670] ffff88020de5f848 0000000000000246 0000000000014ec0 ffff88023ed54ec0
[11642.240475] ffff88021b1ece40 ffff88020de60000 ffff88023ed54ec0 7fffffffffffffff
[11642.242154] 0000000000000002 ffffffff8147b7f9 ffff88020de5f860 ffffffff8147b541
[11642.243715] Call Trace:
[11642.244390] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.245432] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.246392] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.247479] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.248551] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.249968] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.251043] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.252202] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.253210] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.254307] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.256118] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.257131] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.258200] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.259168] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.260516] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.261841] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.263531] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.264747] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.266148] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.267264] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.268280] [<ffffffff81192a2b>] __writeback_single_inode+0xda/0x5ba
[11642.269407] [<ffffffff811939f0>] writeback_sb_inodes+0x27b/0x43d
[11642.270476] [<ffffffff81193c28>] __writeback_inodes_wb+0x76/0xae
[11642.271547] [<ffffffff81193ea6>] wb_writeback+0x19e/0x41c
[11642.272588] [<ffffffff81194821>] wb_workfn+0x201/0x341
[11642.273523] [<ffffffff81194821>] ? wb_workfn+0x201/0x341
[11642.274479] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.275497] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.276518] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.277520] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.278517] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.279371] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.280468] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.281607] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.282604] 3 locks held by kworker/u32:8/15289:
[11642.283423] #0: ("writeback"){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.285629] #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.287538] #2: (&type->s_umount_key#37){+++++.}, at: [<ffffffff81171217>] trylock_super+0x1b/0x4b
[11642.289423] INFO: task fdm-stress:26848 blocked for more than 120 seconds.
[11642.290547] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.291453] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.292864] fdm-stress D ffff88022c107c20 0 26848 26591 0x00000000
[11642.294118] ffff88022c107c20 000000038108affa 0000000000014ec0 ffff88023ed54ec0
[11642.295602] ffff88013ab1ca40 ffff88022c108000 ffff8800b2fc19d0 00000000000e0fff
[11642.297098] ffff8800b2fc19b0 ffff88022c107c88 ffff88022c107c38 ffffffff8147b541
[11642.298433] Call Trace:
[11642.298896] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.299738] [<ffffffffa045225d>] lock_extent_bits+0xfe/0x1a3 [btrfs]
[11642.300833] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.301943] [<ffffffffa0447516>] lock_and_cleanup_extent_if_need+0x68/0x18e [btrfs]
[11642.303270] [<ffffffffa04485ba>] __btrfs_buffered_write+0x238/0x4c1 [btrfs]
[11642.304552] [<ffffffffa044b50a>] ? btrfs_file_write_iter+0x17c/0x408 [btrfs]
[11642.305782] [<ffffffffa044b682>] btrfs_file_write_iter+0x2f4/0x408 [btrfs]
[11642.306878] [<ffffffff8116e298>] __vfs_write+0x7c/0xa5
[11642.307729] [<ffffffff8116e7d1>] vfs_write+0x9d/0xe8
[11642.308602] [<ffffffff8116efbb>] SyS_write+0x50/0x7e
[11642.309410] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.310403] 3 locks held by fdm-stress/26848:
[11642.311108] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.312578] #1: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.314170] #2: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa044b401>] btrfs_file_write_iter+0x73/0x408 [btrfs]
[11642.316796] INFO: task fdm-stress:26849 blocked for more than 120 seconds.
[11642.317842] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.318691] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.319959] fdm-stress D ffff8801964ffa68 0 26849 26591 0x00000000
[11642.321312] ffff8801964ffa68 00ff8801e9975f80 0000000000014ec0 ffff88023ed94ec0
[11642.322555] ffff8800b00b4840 ffff880196500000 ffff8801e9975f20 0000000000000002
[11642.323715] ffff8801e9975f18 ffff8800b00b4840 ffff8801964ffa80 ffffffff8147b541
[11642.325096] Call Trace:
[11642.325532] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.326303] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.327180] [<ffffffff8108ae40>] ? mark_held_locks+0x5e/0x74
[11642.328114] [<ffffffff8147f30e>] ? _raw_spin_unlock_irq+0x2c/0x4a
[11642.329051] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.330053] [<ffffffff8147bceb>] __wait_for_common+0x109/0x147
[11642.330952] [<ffffffff8147bceb>] ? __wait_for_common+0x109/0x147
[11642.331869] [<ffffffff8147e7bb>] ? usleep_range+0x4a/0x4a
[11642.332925] [<ffffffff81074075>] ? wake_up_q+0x47/0x47
[11642.333736] [<ffffffff8147bd4d>] wait_for_completion+0x24/0x26
[11642.334672] [<ffffffffa044f5ce>] btrfs_wait_ordered_extents+0x1c8/0x217 [btrfs]
[11642.335858] [<ffffffffa0465b5a>] btrfs_mksubvol+0x224/0x45d [btrfs]
[11642.336854] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.337820] [<ffffffffa0465edb>] btrfs_ioctl_snap_create_transid+0x148/0x17a [btrfs]
[11642.339026] [<ffffffffa046603b>] btrfs_ioctl_snap_create_v2+0xc7/0x110 [btrfs]
[11642.340214] [<ffffffffa0468582>] btrfs_ioctl+0x590/0x27bd [btrfs]
[11642.341123] [<ffffffff8147dc00>] ? mutex_unlock+0xe/0x10
[11642.341934] [<ffffffffa00fa6e9>] ? ext4_file_write_iter+0x2a3/0x36f [ext4]
[11642.342936] [<ffffffff8108895d>] ? __lock_is_held+0x3c/0x57
[11642.343772] [<ffffffff81186a1d>] ? rcu_read_unlock+0x3e/0x5d
[11642.344673] [<ffffffff8117dc95>] do_vfs_ioctl+0x458/0x4dc
[11642.346024] [<ffffffff81186bbe>] ? __fget_light+0x62/0x71
[11642.346873] [<ffffffff8117dd70>] SyS_ioctl+0x57/0x79
[11642.347720] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.350222] 4 locks held by fdm-stress/26849:
[11642.350898] #0: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.352375] #1: (&type->i_mutex_dir_key#4/1){+.+.+.}, at: [<ffffffffa0465981>] btrfs_mksubvol+0x4b/0x45d [btrfs]
[11642.354072] #2: (&fs_info->subvol_sem){++++..}, at: [<ffffffffa0465a2a>] btrfs_mksubvol+0xf4/0x45d [btrfs]
[11642.355647] #3: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.357516] INFO: task fdm-stress:26850 blocked for more than 120 seconds.
[11642.358508] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.359376] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.368625] fdm-stress D ffff88021f167688 0 26850 26591 0x00000000
[11642.369716] ffff88021f167688 0000000000000001 0000000000014ec0 ffff88023edd4ec0
[11642.370950] ffff880128a98680 ffff88021f168000 ffff88023edd4ec0 7fffffffffffffff
[11642.372210] 0000000000000002 ffffffff8147b7f9 ffff88021f1676a0 ffffffff8147b541
[11642.373430] Call Trace:
[11642.373853] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.374623] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.375948] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.376862] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.377637] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.378610] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.379457] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.380366] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.381353] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.382255] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.383162] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.383945] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.384875] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.385749] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.386721] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.387596] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.389030] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.389973] [<ffffffff810a25ad>] ? rcu_read_lock_sched_held+0x61/0x69
[11642.390939] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.392271] [<ffffffffa0451c32>] ? __clear_extent_bit+0x26e/0x2c0 [btrfs]
[11642.393305] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.394239] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.395045] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.395991] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.397144] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.398392] [<ffffffffa0452094>] ? clear_extent_bit+0x17/0x19 [btrfs]
[11642.399363] [<ffffffffa0445945>] btrfs_get_blocks_direct+0x12b/0x61c [btrfs]
[11642.400445] [<ffffffff8119f7a1>] ? dio_bio_add_page+0x3d/0x54
[11642.401309] [<ffffffff8119fa93>] ? submit_page_section+0x7b/0x111
[11642.402213] [<ffffffff811a0258>] do_blockdev_direct_IO+0x685/0xc24
[11642.403139] [<ffffffffa044581a>] ? btrfs_page_exists_in_range+0x1a1/0x1a1 [btrfs]
[11642.404360] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.406187] [<ffffffff811a0828>] __blockdev_direct_IO+0x31/0x33
[11642.407070] [<ffffffff811a0828>] ? __blockdev_direct_IO+0x31/0x33
[11642.407990] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.409192] [<ffffffffa043b4ca>] btrfs_direct_IO+0x1c7/0x27e [btrfs]
[11642.410146] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.411291] [<ffffffff81119a2c>] generic_file_read_iter+0x89/0x4e1
[11642.412263] [<ffffffff8108ac05>] ? mark_lock+0x24/0x201
[11642.413057] [<ffffffff8116e1f8>] __vfs_read+0x79/0x9d
[11642.413897] [<ffffffff8116e6f1>] vfs_read+0x8f/0xd2
[11642.414708] [<ffffffff8116ef3d>] SyS_read+0x50/0x7e
[11642.415573] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.416572] 1 lock held by fdm-stress/26850:
[11642.417345] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.418703] INFO: task fdm-stress:26851 blocked for more than 120 seconds.
[11642.419698] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.420612] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.421807] fdm-stress D ffff880196483d28 0 26851 26591 0x00000000
[11642.422878] ffff880196483d28 00ff8801c8f60740 0000000000014ec0 ffff88023ed94ec0
[11642.424149] ffff8801c8f60740 ffff880196484000 0000000000000246 ffff8801c8f60740
[11642.425374] ffff8801bb711840 ffff8801bb711878 ffff880196483d40 ffffffff8147b541
[11642.426591] Call Trace:
[11642.427013] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.427856] [<ffffffff8147b6d5>] schedule_preempt_disabled+0x18/0x24
[11642.428852] [<ffffffff8147c23a>] mutex_lock_nested+0x1d7/0x3b4
[11642.429743] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.430911] [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.432102] [<ffffffffa044f674>] ? btrfs_wait_ordered_roots+0x57/0x191 [btrfs]
[11642.433259] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.434431] [<ffffffffa044f6ea>] btrfs_wait_ordered_roots+0xcd/0x191 [btrfs]
[11642.436079] [<ffffffffa0410cab>] btrfs_sync_fs+0xe0/0x1ad [btrfs]
[11642.437009] [<ffffffff81197900>] ? SyS_tee+0x23c/0x23c
[11642.437860] [<ffffffff81197920>] sync_fs_one_sb+0x20/0x22
[11642.438723] [<ffffffff81171435>] iterate_supers+0x75/0xc2
[11642.439597] [<ffffffff81197d00>] sys_sync+0x52/0x80
[11642.440454] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.441533] 3 locks held by fdm-stress/26851:
[11642.442370] #0: (&type->s_umount_key#37){+++++.}, at: [<ffffffff8117141f>] iterate_supers+0x5f/0xc2
[11642.444043] #1: (&fs_info->ordered_operations_mutex){+.+...}, at: [<ffffffffa044f661>] btrfs_wait_ordered_roots+0x44/0x191 [btrfs]
[11642.446010] #2: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
This happened because under specific timings the path for direct IO reads
can deadlock with concurrent buffered writes. The diagram below shows how
this happens for an example file that has the following layout:
[ extent A ] [ extent B ] [ ....
0K 4K 8K
CPU 1 CPU 2 CPU 3
DIO read against range
[0K, 8K[ starts
btrfs_direct_IO()
--> calls btrfs_get_blocks_direct()
which finds the extent map for the
extent A and leaves the range
[0K, 4K[ locked in the inode's
io tree
buffered write against
range [4K, 8K[ starts
__btrfs_buffered_write()
--> dirties page at 4K
a user space
task calls sync
for e.g or
writepages() is
invoked by mm
writepages()
run_delalloc_range()
cow_file_range()
--> ordered extent X
for the buffered
write is created
and
writeback starts
--> calls btrfs_get_blocks_direct()
again, without submitting first
a bio for reading extent A, and
finds the extent map for extent B
--> calls lock_extent_direct()
--> locks range [4K, 8K[
--> finds ordered extent X
covering range [4K, 8K[
--> unlocks range [4K, 8K[
buffered write against
range [0K, 8K[ starts
__btrfs_buffered_write()
prepare_pages()
--> locks pages with
offsets 0 and 4K
lock_and_cleanup_extent_if_need()
--> blocks attempting to
lock range [0K, 8K[ in
the inode's io tree,
because the range [0, 4K[
is already locked by the
direct IO task at CPU 1
--> calls
btrfs_start_ordered_extent(oe X)
btrfs_start_ordered_extent(oe X)
--> At this point writeback for ordered
extent X has not finished yet
filemap_fdatawrite_range()
btrfs_writepages()
extent_writepages()
extent_write_cache_pages()
--> finds page with offset 0
with the writeback tag
(and not dirty)
--> tries to lock it
--> deadlock, task at CPU 2
has the page locked and
is blocked on the io range
[0, 4K[ that was locked
earlier by this task
So fix this by falling back to a buffered read in the direct IO read path
when an ordered extent for a buffered write is found.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-18 14:28:55 +00:00
|
|
|
/*
|
|
|
|
* If we are doing a DIO read and the ordered extent we
|
|
|
|
* found is for a buffered write, we can not wait for it
|
|
|
|
* to complete and retry, because if we do so we can
|
|
|
|
* deadlock with concurrent buffered writes on page
|
|
|
|
* locks. This happens only if our DIO read covers more
|
|
|
|
* than one extent map, if at this point it has already
|
|
|
|
* created an ordered extent for a previous extent map
|
|
|
|
* and locked its range in the inode's io tree, and a
|
|
|
|
* concurrent write against that previous extent map's
|
|
|
|
* range and this range started (we unlock the ranges
|
|
|
|
* in the io tree only when the bios complete and
|
|
|
|
* buffered writes always lock pages before attempting
|
|
|
|
* to lock range in the io tree).
|
|
|
|
*/
|
|
|
|
if (writing ||
|
|
|
|
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
|
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
|
|
|
else
|
|
|
|
ret = -ENOTBLK;
|
2012-07-31 20:28:48 +00:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
} else {
|
|
|
|
/*
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back to buffered IO. This was a rare case anyway, and well-behaved
applications do not mix concurrent direct IO writes with buffered reads,
a concurrent defrag being the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-08 16:23:16 +00:00
|
|
|
* We could trigger writeback for this range (and wait
|
|
|
|
* for it to complete) and then invalidate the pages for
|
|
|
|
* this range (through invalidate_inode_pages2_range()),
|
|
|
|
* but that can lead us to a deadlock with a concurrent
|
|
|
|
* call to readpages() (a buffered read or a defrag call
|
|
|
|
* triggered a readahead) on a page lock due to an
|
|
|
|
* ordered dio extent we created before but did not have
|
|
|
|
* yet a corresponding bio submitted (whence it can not
|
|
|
|
* complete), which makes readpages() wait for that
|
|
|
|
* ordered extent to complete while holding a lock on
|
|
|
|
* that page.
|
2012-07-31 20:28:48 +00:00
|
|
|
*/
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back to buffered IO. This was a rare case anyway, and well behaved
applications do not mix concurrent direct IO writes with buffered reads;
a concurrent defrag is the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-08 16:23:16 +00:00
|
|
|
ret = -ENOTBLK;
|
2012-07-31 20:28:48 +00:00
|
|
|
}
|
|
|
|
|
Btrfs: fix deadlock between direct IO reads and buffered writes
While running a test with a mix of buffered IO and direct IO against
the same files I hit a deadlock reported by the following trace:
[11642.140352] INFO: task kworker/u32:3:15282 blocked for more than 120 seconds.
[11642.142452] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.143982] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.146332] kworker/u32:3 D ffff880230ef7988 [11642.147737] systemd-journald[571]: Sent WATCHDOG=1 notification.
[11642.149771] 0 15282 2 0x00000000
[11642.151205] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs]
[11642.154074] ffff880230ef7988 0000000000000246 0000000000014ec0 ffff88023ec94ec0
[11642.156722] ffff880233fe8f80 ffff880230ef8000 ffff88023ec94ec0 7fffffffffffffff
[11642.159205] 0000000000000002 ffffffff8147b7f9 ffff880230ef79a0 ffffffff8147b541
[11642.161403] Call Trace:
[11642.162129] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.163396] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.164871] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.167020] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.167931] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.182320] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.183762] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.185308] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.186782] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.188217] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.189626] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.190803] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.192158] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.193379] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.194831] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.197068] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.199188] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.200723] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.202465] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.203836] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.205624] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.207057] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.208529] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.210375] [<ffffffffa0462613>] ? btrfs_scrubparity_helper+0x140/0x33a [btrfs]
[11642.212132] [<ffffffffa044f974>] btrfs_run_ordered_extent_work+0x25/0x34 [btrfs]
[11642.213837] [<ffffffffa046262f>] btrfs_scrubparity_helper+0x15c/0x33a [btrfs]
[11642.215457] [<ffffffffa046293b>] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
[11642.217095] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.218324] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.219466] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.220801] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.222032] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.223190] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.224394] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.226295] 2 locks held by kworker/u32:3/15282:
[11642.227273] #0: ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.229412] #1: ((&work->normal_work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.231414] INFO: task kworker/u32:8:15289 blocked for more than 120 seconds.
[11642.232872] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.234109] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.235776] kworker/u32:8 D ffff88020de5f848 0 15289 2 0x00000000
[11642.237412] Workqueue: writeback wb_workfn (flush-btrfs-481)
[11642.238670] ffff88020de5f848 0000000000000246 0000000000014ec0 ffff88023ed54ec0
[11642.240475] ffff88021b1ece40 ffff88020de60000 ffff88023ed54ec0 7fffffffffffffff
[11642.242154] 0000000000000002 ffffffff8147b7f9 ffff88020de5f860 ffffffff8147b541
[11642.243715] Call Trace:
[11642.244390] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.245432] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.246392] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.247479] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.248551] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.249968] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.251043] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.252202] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.253210] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.254307] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.256118] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.257131] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.258200] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.259168] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.260516] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.261841] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.263531] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.264747] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.266148] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.267264] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.268280] [<ffffffff81192a2b>] __writeback_single_inode+0xda/0x5ba
[11642.269407] [<ffffffff811939f0>] writeback_sb_inodes+0x27b/0x43d
[11642.270476] [<ffffffff81193c28>] __writeback_inodes_wb+0x76/0xae
[11642.271547] [<ffffffff81193ea6>] wb_writeback+0x19e/0x41c
[11642.272588] [<ffffffff81194821>] wb_workfn+0x201/0x341
[11642.273523] [<ffffffff81194821>] ? wb_workfn+0x201/0x341
[11642.274479] [<ffffffff8106483e>] process_one_work+0x256/0x48b
[11642.275497] [<ffffffff81064f20>] worker_thread+0x1f5/0x2a7
[11642.276518] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.277520] [<ffffffff81064d2b>] ? rescuer_thread+0x289/0x289
[11642.278517] [<ffffffff8106a500>] kthread+0xd4/0xdc
[11642.279371] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.280468] [<ffffffff8147fdef>] ret_from_fork+0x3f/0x70
[11642.281607] [<ffffffff8106a42c>] ? kthread_parkme+0x24/0x24
[11642.282604] 3 locks held by kworker/u32:8/15289:
[11642.283423] #0: ("writeback"){++++.+}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.285629] #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [<ffffffff8106474d>] process_one_work+0x165/0x48b
[11642.287538] #2: (&type->s_umount_key#37){+++++.}, at: [<ffffffff81171217>] trylock_super+0x1b/0x4b
[11642.289423] INFO: task fdm-stress:26848 blocked for more than 120 seconds.
[11642.290547] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.291453] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.292864] fdm-stress D ffff88022c107c20 0 26848 26591 0x00000000
[11642.294118] ffff88022c107c20 000000038108affa 0000000000014ec0 ffff88023ed54ec0
[11642.295602] ffff88013ab1ca40 ffff88022c108000 ffff8800b2fc19d0 00000000000e0fff
[11642.297098] ffff8800b2fc19b0 ffff88022c107c88 ffff88022c107c38 ffffffff8147b541
[11642.298433] Call Trace:
[11642.298896] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.299738] [<ffffffffa045225d>] lock_extent_bits+0xfe/0x1a3 [btrfs]
[11642.300833] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.301943] [<ffffffffa0447516>] lock_and_cleanup_extent_if_need+0x68/0x18e [btrfs]
[11642.303270] [<ffffffffa04485ba>] __btrfs_buffered_write+0x238/0x4c1 [btrfs]
[11642.304552] [<ffffffffa044b50a>] ? btrfs_file_write_iter+0x17c/0x408 [btrfs]
[11642.305782] [<ffffffffa044b682>] btrfs_file_write_iter+0x2f4/0x408 [btrfs]
[11642.306878] [<ffffffff8116e298>] __vfs_write+0x7c/0xa5
[11642.307729] [<ffffffff8116e7d1>] vfs_write+0x9d/0xe8
[11642.308602] [<ffffffff8116efbb>] SyS_write+0x50/0x7e
[11642.309410] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.310403] 3 locks held by fdm-stress/26848:
[11642.311108] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.312578] #1: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.314170] #2: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa044b401>] btrfs_file_write_iter+0x73/0x408 [btrfs]
[11642.316796] INFO: task fdm-stress:26849 blocked for more than 120 seconds.
[11642.317842] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.318691] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.319959] fdm-stress D ffff8801964ffa68 0 26849 26591 0x00000000
[11642.321312] ffff8801964ffa68 00ff8801e9975f80 0000000000014ec0 ffff88023ed94ec0
[11642.322555] ffff8800b00b4840 ffff880196500000 ffff8801e9975f20 0000000000000002
[11642.323715] ffff8801e9975f18 ffff8800b00b4840 ffff8801964ffa80 ffffffff8147b541
[11642.325096] Call Trace:
[11642.325532] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.326303] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.327180] [<ffffffff8108ae40>] ? mark_held_locks+0x5e/0x74
[11642.328114] [<ffffffff8147f30e>] ? _raw_spin_unlock_irq+0x2c/0x4a
[11642.329051] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.330053] [<ffffffff8147bceb>] __wait_for_common+0x109/0x147
[11642.330952] [<ffffffff8147bceb>] ? __wait_for_common+0x109/0x147
[11642.331869] [<ffffffff8147e7bb>] ? usleep_range+0x4a/0x4a
[11642.332925] [<ffffffff81074075>] ? wake_up_q+0x47/0x47
[11642.333736] [<ffffffff8147bd4d>] wait_for_completion+0x24/0x26
[11642.334672] [<ffffffffa044f5ce>] btrfs_wait_ordered_extents+0x1c8/0x217 [btrfs]
[11642.335858] [<ffffffffa0465b5a>] btrfs_mksubvol+0x224/0x45d [btrfs]
[11642.336854] [<ffffffff81082eef>] ? add_wait_queue_exclusive+0x44/0x44
[11642.337820] [<ffffffffa0465edb>] btrfs_ioctl_snap_create_transid+0x148/0x17a [btrfs]
[11642.339026] [<ffffffffa046603b>] btrfs_ioctl_snap_create_v2+0xc7/0x110 [btrfs]
[11642.340214] [<ffffffffa0468582>] btrfs_ioctl+0x590/0x27bd [btrfs]
[11642.341123] [<ffffffff8147dc00>] ? mutex_unlock+0xe/0x10
[11642.341934] [<ffffffffa00fa6e9>] ? ext4_file_write_iter+0x2a3/0x36f [ext4]
[11642.342936] [<ffffffff8108895d>] ? __lock_is_held+0x3c/0x57
[11642.343772] [<ffffffff81186a1d>] ? rcu_read_unlock+0x3e/0x5d
[11642.344673] [<ffffffff8117dc95>] do_vfs_ioctl+0x458/0x4dc
[11642.346024] [<ffffffff81186bbe>] ? __fget_light+0x62/0x71
[11642.346873] [<ffffffff8117dd70>] SyS_ioctl+0x57/0x79
[11642.347720] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.350222] 4 locks held by fdm-stress/26849:
[11642.350898] #0: (sb_writers#11){.+.+.+}, at: [<ffffffff811706ee>] __sb_start_write+0x5f/0xb0
[11642.352375] #1: (&type->i_mutex_dir_key#4/1){+.+.+.}, at: [<ffffffffa0465981>] btrfs_mksubvol+0x4b/0x45d [btrfs]
[11642.354072] #2: (&fs_info->subvol_sem){++++..}, at: [<ffffffffa0465a2a>] btrfs_mksubvol+0xf4/0x45d [btrfs]
[11642.355647] #3: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.357516] INFO: task fdm-stress:26850 blocked for more than 120 seconds.
[11642.358508] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.359376] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.368625] fdm-stress D ffff88021f167688 0 26850 26591 0x00000000
[11642.369716] ffff88021f167688 0000000000000001 0000000000014ec0 ffff88023edd4ec0
[11642.370950] ffff880128a98680 ffff88021f168000 ffff88023edd4ec0 7fffffffffffffff
[11642.372210] 0000000000000002 ffffffff8147b7f9 ffff88021f1676a0 ffffffff8147b541
[11642.373430] Call Trace:
[11642.373853] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.374623] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.375948] [<ffffffff8147e7fe>] schedule_timeout+0x43/0x109
[11642.376862] [<ffffffff8147b7f9>] ? bit_wait+0x2f/0x2f
[11642.377637] [<ffffffff8108afd1>] ? trace_hardirqs_on_caller+0x17b/0x197
[11642.378610] [<ffffffff8108affa>] ? trace_hardirqs_on+0xd/0xf
[11642.379457] [<ffffffff810b079b>] ? timekeeping_get_ns+0xe/0x33
[11642.380366] [<ffffffff810b0f61>] ? ktime_get+0x41/0x52
[11642.381353] [<ffffffff8147ac08>] io_schedule_timeout+0xa0/0x102
[11642.382255] [<ffffffff8147ac08>] ? io_schedule_timeout+0xa0/0x102
[11642.383162] [<ffffffff8147b814>] bit_wait_io+0x1b/0x39
[11642.383945] [<ffffffff8147bb21>] __wait_on_bit_lock+0x4c/0x90
[11642.384875] [<ffffffff8111829f>] __lock_page+0x66/0x68
[11642.385749] [<ffffffff81082f29>] ? autoremove_wake_function+0x3a/0x3a
[11642.386721] [<ffffffffa0450ddd>] lock_page+0x31/0x34 [btrfs]
[11642.387596] [<ffffffffa0454e3b>] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs]
[11642.389030] [<ffffffffa0455373>] extent_writepages+0x4b/0x5c [btrfs]
[11642.389973] [<ffffffff810a25ad>] ? rcu_read_lock_sched_held+0x61/0x69
[11642.390939] [<ffffffffa043c913>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[11642.392271] [<ffffffffa0451c32>] ? __clear_extent_bit+0x26e/0x2c0 [btrfs]
[11642.393305] [<ffffffffa043aa82>] btrfs_writepages+0x28/0x2a [btrfs]
[11642.394239] [<ffffffff811236bc>] do_writepages+0x23/0x2c
[11642.395045] [<ffffffff811198c9>] __filemap_fdatawrite_range+0x5a/0x61
[11642.395991] [<ffffffff81119946>] filemap_fdatawrite_range+0x13/0x15
[11642.397144] [<ffffffffa044f87e>] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs]
[11642.398392] [<ffffffffa0452094>] ? clear_extent_bit+0x17/0x19 [btrfs]
[11642.399363] [<ffffffffa0445945>] btrfs_get_blocks_direct+0x12b/0x61c [btrfs]
[11642.400445] [<ffffffff8119f7a1>] ? dio_bio_add_page+0x3d/0x54
[11642.401309] [<ffffffff8119fa93>] ? submit_page_section+0x7b/0x111
[11642.402213] [<ffffffff811a0258>] do_blockdev_direct_IO+0x685/0xc24
[11642.403139] [<ffffffffa044581a>] ? btrfs_page_exists_in_range+0x1a1/0x1a1 [btrfs]
[11642.404360] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.406187] [<ffffffff811a0828>] __blockdev_direct_IO+0x31/0x33
[11642.407070] [<ffffffff811a0828>] ? __blockdev_direct_IO+0x31/0x33
[11642.407990] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.409192] [<ffffffffa043b4ca>] btrfs_direct_IO+0x1c7/0x27e [btrfs]
[11642.410146] [<ffffffffa043d267>] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs]
[11642.411291] [<ffffffff81119a2c>] generic_file_read_iter+0x89/0x4e1
[11642.412263] [<ffffffff8108ac05>] ? mark_lock+0x24/0x201
[11642.413057] [<ffffffff8116e1f8>] __vfs_read+0x79/0x9d
[11642.413897] [<ffffffff8116e6f1>] vfs_read+0x8f/0xd2
[11642.414708] [<ffffffff8116ef3d>] SyS_read+0x50/0x7e
[11642.415573] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.416572] 1 lock held by fdm-stress/26850:
[11642.417345] #0: (&f->f_pos_lock){+.+.+.}, at: [<ffffffff811877e8>] __fdget_pos+0x3a/0x40
[11642.418703] INFO: task fdm-stress:26851 blocked for more than 120 seconds.
[11642.419698] Not tainted 4.4.0-rc6-btrfs-next-21+ #1
[11642.420612] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[11642.421807] fdm-stress D ffff880196483d28 0 26851 26591 0x00000000
[11642.422878] ffff880196483d28 00ff8801c8f60740 0000000000014ec0 ffff88023ed94ec0
[11642.424149] ffff8801c8f60740 ffff880196484000 0000000000000246 ffff8801c8f60740
[11642.425374] ffff8801bb711840 ffff8801bb711878 ffff880196483d40 ffffffff8147b541
[11642.426591] Call Trace:
[11642.427013] [<ffffffff8147b541>] schedule+0x82/0x9a
[11642.427856] [<ffffffff8147b6d5>] schedule_preempt_disabled+0x18/0x24
[11642.428852] [<ffffffff8147c23a>] mutex_lock_nested+0x1d7/0x3b4
[11642.429743] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.430911] [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.432102] [<ffffffffa044f674>] ? btrfs_wait_ordered_roots+0x57/0x191 [btrfs]
[11642.433259] [<ffffffffa044f456>] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
[11642.434431] [<ffffffffa044f6ea>] btrfs_wait_ordered_roots+0xcd/0x191 [btrfs]
[11642.436079] [<ffffffffa0410cab>] btrfs_sync_fs+0xe0/0x1ad [btrfs]
[11642.437009] [<ffffffff81197900>] ? SyS_tee+0x23c/0x23c
[11642.437860] [<ffffffff81197920>] sync_fs_one_sb+0x20/0x22
[11642.438723] [<ffffffff81171435>] iterate_supers+0x75/0xc2
[11642.439597] [<ffffffff81197d00>] sys_sync+0x52/0x80
[11642.440454] [<ffffffff8147fa97>] entry_SYSCALL_64_fastpath+0x12/0x6b
[11642.441533] 3 locks held by fdm-stress/26851:
[11642.442370] #0: (&type->s_umount_key#37){+++++.}, at: [<ffffffff8117141f>] iterate_supers+0x5f/0xc2
[11642.444043] #1: (&fs_info->ordered_operations_mutex){+.+...}, at: [<ffffffffa044f661>] btrfs_wait_ordered_roots+0x44/0x191 [btrfs]
[11642.446010] #2: (&root->ordered_extent_mutex){+.+...}, at: [<ffffffffa044f456>] btrfs_wait_ordered_extents+0x50/0x217 [btrfs]
This happened because under specific timings the path for direct IO reads
can deadlock with concurrent buffered writes. The diagram below shows how
this happens for an example file that has the following layout:
[ extent A ] [ extent B ] [ ....
0K 4K 8K
CPU 1 CPU 2 CPU 3
DIO read against range
[0K, 8K[ starts
btrfs_direct_IO()
--> calls btrfs_get_blocks_direct()
which finds the extent map for the
extent A and leaves the range
[0K, 4K[ locked in the inode's
io tree
buffered write against
range [4K, 8K[ starts
__btrfs_buffered_write()
--> dirties page at 4K
a user space
task calls sync
for e.g or
writepages() is
invoked by mm
writepages()
run_delalloc_range()
cow_file_range()
--> ordered extent X
for the buffered
write is created
and
writeback starts
--> calls btrfs_get_blocks_direct()
again, without submitting first
a bio for reading extent A, and
finds the extent map for extent B
--> calls lock_extent_direct()
--> locks range [4K, 8K[
--> finds ordered extent X
covering range [4K, 8K[
--> unlocks range [4K, 8K[
buffered write against
range [0K, 8K[ starts
__btrfs_buffered_write()
prepare_pages()
--> locks pages with
offsets 0 and 4K
lock_and_cleanup_extent_if_need()
--> blocks attempting to
lock range [0K, 8K[ in
the inode's io tree,
because the range [0, 4K[
is already locked by the
direct IO task at CPU 1
--> calls
btrfs_start_ordered_extent(oe X)
btrfs_start_ordered_extent(oe X)
--> At this point writeback for ordered
extent X has not finished yet
filemap_fdatawrite_range()
btrfs_writepages()
extent_writepages()
extent_write_cache_pages()
--> finds page with offset 0
with the writeback tag
(and not dirty)
--> tries to lock it
--> deadlock, task at CPU 2
has the page locked and
is blocked on the io range
[0, 4K[ that was locked
earlier by this task
So fix this by falling back to a buffered read in the direct IO read path
when an ordered extent for a buffered write is found.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-18 14:28:55 +00:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
2012-07-31 20:28:48 +00:00
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-01-31 15:50:22 +00:00
|
|
|
/* The callers of this must take lock_extent() */
|
|
|
|
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
|
|
|
|
u64 orig_start, u64 block_start,
|
|
|
|
u64 block_len, u64 orig_block_len,
|
|
|
|
u64 ram_bytes, int compress_type,
|
|
|
|
int type)
|
2012-09-11 19:40:07 +00:00
|
|
|
{
|
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret;
|
|
|
|
|
2017-01-31 15:50:22 +00:00
|
|
|
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
|
|
|
|
type == BTRFS_ORDERED_COMPRESSED ||
|
|
|
|
type == BTRFS_ORDERED_NOCOW ||
|
2017-02-13 23:35:09 +00:00
|
|
|
type == BTRFS_ORDERED_REGULAR);
|
2017-01-31 15:50:22 +00:00
|
|
|
|
2012-09-11 19:40:07 +00:00
|
|
|
em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
em = alloc_extent_map();
|
|
|
|
if (!em)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
em->start = start;
|
|
|
|
em->orig_start = orig_start;
|
|
|
|
em->len = len;
|
|
|
|
em->block_len = block_len;
|
|
|
|
em->block_start = block_start;
|
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2012-12-03 15:31:19 +00:00
|
|
|
em->orig_block_len = orig_block_len;
|
2013-04-04 18:31:27 +00:00
|
|
|
em->ram_bytes = ram_bytes;
|
2012-10-11 20:54:30 +00:00
|
|
|
em->generation = -1;
|
2012-09-11 19:40:07 +00:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
2017-02-13 23:35:09 +00:00
|
|
|
if (type == BTRFS_ORDERED_PREALLOC) {
|
2012-12-03 15:58:15 +00:00
|
|
|
set_bit(EXTENT_FLAG_FILLING, &em->flags);
|
2017-02-13 23:35:09 +00:00
|
|
|
} else if (type == BTRFS_ORDERED_COMPRESSED) {
|
2017-01-31 15:50:22 +00:00
|
|
|
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
|
|
|
|
em->compress_type = compress_type;
|
|
|
|
}
|
2012-09-11 19:40:07 +00:00
|
|
|
|
|
|
|
do {
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
|
2012-09-11 19:40:07 +00:00
|
|
|
em->start + em->len - 1, 0);
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-05 20:51:15 +00:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
2012-09-11 19:40:07 +00:00
|
|
|
write_unlock(&em_tree->lock);
|
2017-01-31 15:50:22 +00:00
|
|
|
/*
|
|
|
|
* The caller has taken lock_extent(), who could race with us
|
|
|
|
* to add em?
|
|
|
|
*/
|
2012-09-11 19:40:07 +00:00
|
|
|
} while (ret == -EEXIST);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2017-01-31 15:50:22 +00:00
|
|
|
/* em got 2 refs now, callers needs to do free_extent_map once. */
|
2012-09-11 19:40:07 +00:00
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2018-05-02 12:19:32 +00:00
|
|
|
|
|
|
|
/*
 * Map a direct IO read of [start, start + len) onto the extent described
 * by @em and describe the result in @bh_result.
 *
 * Returns -ENOENT, without touching @bh_result, when @em is a hole or is
 * flagged EXTENT_FLAG_PREALLOC.  Otherwise the length is clamped to what
 * remains of @em past @start, @bh_result gets the block number, size and
 * block device, is marked mapped, and 0 is returned.
 */
static int btrfs_get_blocks_direct_read(struct extent_map *em,
					struct buffer_head *bh_result,
					struct inode *inode,
					u64 start, u64 len)
{
	u64 em_offset;

	if (em->block_start == EXTENT_MAP_HOLE)
		return -ENOENT;
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		return -ENOENT;

	/* Distance of the requested start from the beginning of the map. */
	em_offset = start - em->start;
	len = min(len, em->len - em_offset);

	bh_result->b_bdev = em->bdev;
	bh_result->b_size = len;
	/* Byte address on disk converted to a block number. */
	bh_result->b_blocknr = (em->block_start + em_offset) >>
			       inode->i_blkbits;
	set_buffer_mapped(bh_result);

	return 0;
}
|
|
|
|
|
2018-05-02 12:19:33 +00:00
|
|
|
/*
 * Map a direct IO write of [start, start + len) and describe the target in
 * @bh_result.
 *
 * @map points at the extent map covering @start.  It may be replaced: the
 * PREALLOC path swaps in the map returned by btrfs_create_dio_extent() and
 * the COW path swaps in the one returned by btrfs_new_extent_direct().  In
 * both cases the previous map is released with free_extent_map(), and on
 * failure *map can be left holding an ERR_PTR value.
 *
 * On success the inode size is extended if needed (unless this is an
 * overwrite), @dio_data's reservation and unsubmitted range end are
 * updated, and current->journal_info is set to @dio_data.
 *
 * Returns 0 on success or the negative error carried by the ERR_PTR that
 * btrfs_create_dio_extent() / btrfs_new_extent_direct() returned.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct buffer_head *bh_result,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = *map;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use
	 *    the existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		u64 block_start, orig_start, orig_block_len, ram_bytes;
		int type = test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ?
			   BTRFS_ORDERED_PREALLOC : BTRFS_ORDERED_NOCOW;

		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &orig_start,
				     &orig_block_len, &ram_bytes) == 1 &&
		    btrfs_inc_nocow_writers(fs_info, block_start)) {
			struct extent_map *dio_em;

			dio_em = btrfs_create_dio_extent(inode, start, len,
							 orig_start,
							 block_start, len,
							 orig_block_len,
							 ram_bytes, type);
			btrfs_dec_nocow_writers(fs_info, block_start);

			/* PREALLOC hands the freshly created map back. */
			if (type == BTRFS_ORDERED_PREALLOC) {
				free_extent_map(em);
				em = dio_em;
				*map = dio_em;
			}

			/* IS_ERR() is false for NULL, so no NULL check. */
			if (IS_ERR(dio_em))
				return PTR_ERR(dio_em);

			/*
			 * For inode marked NODATACOW or extent marked
			 * PREALLOC, use the existing or preallocated extent,
			 * so does not need to adjust btrfs_space_info's
			 * bytes_may_use.
			 */
			btrfs_free_reserved_data_space_noquota(inode, start,
							       len);
			goto skip_cow;
		}
	}

	/*
	 * This will cow the extent.  Start again from the full requested
	 * size, since len may have been changed above.
	 */
	len = bh_result->b_size;
	free_extent_map(em);
	em = btrfs_new_extent_direct(inode, start, len);
	*map = em;
	if (IS_ERR(em))
		return PTR_ERR(em);

	len = min(len, em->len - (start - em->start));

skip_cow:
	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
			       inode->i_blkbits;
	bh_result->b_size = len;
	bh_result->b_bdev = em->bdev;
	set_buffer_mapped(bh_result);

	if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		set_buffer_new(bh_result);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (!dio_data->overwrite && start + len > i_size_read(inode))
		i_size_write(inode, start + len);

	WARN_ON(dio_data->reserve < len);
	dio_data->reserve -= len;
	dio_data->unsubmitted_oe_range_end = start + len;
	current->journal_info = dio_data;

	return 0;
}
|
|
|
|
|
2010-05-23 15:00:55 +00:00
|
|
|
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
|
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2010-05-23 15:00:55 +00:00
|
|
|
struct extent_map *em;
|
2012-07-31 20:28:48 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2015-08-28 15:40:13 +00:00
|
|
|
struct btrfs_dio_data *dio_data = NULL;
|
2010-05-23 15:00:55 +00:00
|
|
|
u64 start = iblock << inode->i_blkbits;
|
2012-07-31 20:28:48 +00:00
|
|
|
u64 lockstart, lockend;
|
2010-05-23 15:00:55 +00:00
|
|
|
u64 len = bh_result->b_size;
|
2012-07-31 20:28:48 +00:00
|
|
|
int unlock_bits = EXTENT_LOCKED;
|
2013-02-07 10:12:07 +00:00
|
|
|
int ret = 0;
|
2012-07-31 20:28:48 +00:00
|
|
|
|
Btrfs: fix wrong outstanding_extents when doing DIO write
When running the 083rd case of xfstests on the filesystem with
"compress-force=lzo", the following WARNINGs were triggered.
WARNING: at fs/btrfs/inode.c:7908
WARNING: at fs/btrfs/inode.c:7909
WARNING: at fs/btrfs/inode.c:7911
WARNING: at fs/btrfs/extent-tree.c:4510
WARNING: at fs/btrfs/extent-tree.c:4511
This problem was introduced by the patch "Btrfs: fix deadlock due
to unsubmitted". In this patch, there are two bugs which caused
the above problem.
The 1st one is an off-by-one bug: if the DIO write returns 0, it is
also a short write, and we need to release the reserved space for it. But
we didn't do it in that patch. Fix it by changing "ret > 0" to
"ret >= 0".
The 2nd one is that ->outstanding_extents was increased twice when
a short write happened. As we know, ->outstanding_extents is
a counter to keep track of the number of extent items we may
use due to delalloc. When we reserve the free space for a
delalloc write, we assume that the write will introduce just
one extent item, so we increase ->outstanding_extents by 1 at
that time. And then we will increase it every time we split the
write, which is done at the beginning of btrfs_get_blocks_direct().
So when a short write happens, we needn't increase
->outstanding_extents again. But that patch did so.
In order to fix the 2nd problem, I re-write the logic for
->outstanding_extents operation. We don't increase it at the
beginning of btrfs_get_blocks_direct(), instead, we just
increase it when the split actually happens.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-02-21 09:48:22 +00:00
|
|
|
if (create)
|
2015-02-11 20:08:58 +00:00
|
|
|
unlock_bits |= EXTENT_DIRTY;
|
Btrfs: fix wrong outstanding_extents when doing DIO write
When running the 083th case of xfstests on the filesystem with
"compress-force=lzo", the following WARNINGs were triggered.
WARNING: at fs/btrfs/inode.c:7908
WARNING: at fs/btrfs/inode.c:7909
WARNING: at fs/btrfs/inode.c:7911
WARNING: at fs/btrfs/extent-tree.c:4510
WARNING: at fs/btrfs/extent-tree.c:4511
This problem was introduced by the patch "Btrfs: fix deadlock due
to unsubmitted". In this patch, there are two bugs which caused
the above problem.
The 1st one is an off-by-one bug: if the DIO write returns 0, it is
also a short write, and we need to release the reserved space for it. But
we didn't do that in that patch. Fix it by changing "ret > 0" to
"ret >= 0".
The 2nd one is that ->outstanding_extents was increased twice when
a short write happened. As we know, ->outstanding_extents is
a counter to keep track of the number of extent items we may
use due to delalloc. When we reserve the free space for a
delalloc write, we assume that the write will introduce just
one extent item, so we increase ->outstanding_extents by 1 at
that time. And then we will increase it every time we split the
write, which is done at the beginning of btrfs_get_blocks_direct().
So when a short write happens, we needn't increase
->outstanding_extents again. But that patch did so.
In order to fix the 2nd problem, I re-write the logic for
->outstanding_extents operation. We don't increase it at the
beginning of btrfs_get_blocks_direct(), instead, we just
increase it when the split actually happens.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-02-21 09:48:22 +00:00
|
|
|
else
|
2016-06-22 22:54:23 +00:00
|
|
|
len = min_t(u64, len, fs_info->sectorsize);
|
2012-07-31 20:28:48 +00:00
|
|
|
|
2012-08-03 20:49:19 +00:00
|
|
|
lockstart = start;
|
|
|
|
lockend = start + len - 1;
|
|
|
|
|
2015-03-17 14:52:28 +00:00
|
|
|
if (current->journal_info) {
|
|
|
|
/*
|
|
|
|
* Need to pull our outstanding extents and set journal_info to NULL so
|
2016-05-20 01:18:45 +00:00
|
|
|
* that anything that needs to check if there's a transaction doesn't get
|
2015-03-17 14:52:28 +00:00
|
|
|
* confused.
|
|
|
|
*/
|
2015-08-28 15:40:13 +00:00
|
|
|
dio_data = current->journal_info;
|
2015-03-17 14:52:28 +00:00
|
|
|
current->journal_info = NULL;
|
|
|
|
}
|
|
|
|
|
2012-07-31 20:28:48 +00:00
|
|
|
/*
|
|
|
|
* If this errors out it's because we couldn't invalidate pagecache for
|
|
|
|
* this range and we need to fallback to buffered.
|
|
|
|
*/
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignoring that we account extents
in BTRFS_MAX_EXTENT_SIZE units, so fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.c:btrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must have a value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 09:52:04 +00:00
|
|
|
if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
|
|
|
|
create)) {
|
|
|
|
ret = -ENOTBLK;
|
|
|
|
goto err;
|
|
|
|
}
|
2012-07-31 20:28:48 +00:00
|
|
|
|
2017-02-20 11:51:06 +00:00
|
|
|
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
|
2012-07-31 20:28:48 +00:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
goto unlock_err;
|
|
|
|
}
|
2010-05-23 15:00:55 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
|
|
|
|
* io. INLINE is special, and we could probably kludge it in here, but
|
|
|
|
* it's still buffered so for safety lets just fall back to the generic
|
|
|
|
* buffered path.
|
|
|
|
*
|
|
|
|
* For COMPRESSED we _have_ to read the entire extent in so we can
|
|
|
|
* decompress it, so there will be buffering required no matter what we
|
|
|
|
* do, so go ahead and fallback to buffered.
|
|
|
|
*
|
2016-05-20 01:18:45 +00:00
|
|
|
* We return -ENOTBLK because that's what makes DIO go ahead and go back
|
2010-05-23 15:00:55 +00:00
|
|
|
* to buffered IO. Don't blame me, this is the price we pay for using
|
|
|
|
* the generic code.
|
|
|
|
*/
|
|
|
|
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
|
|
|
|
em->block_start == EXTENT_MAP_INLINE) {
|
|
|
|
free_extent_map(em);
|
2012-07-31 20:28:48 +00:00
|
|
|
ret = -ENOTBLK;
|
|
|
|
goto unlock_err;
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
2018-05-02 12:19:33 +00:00
|
|
|
if (create) {
|
|
|
|
ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
|
|
|
|
dio_data, start, len);
|
|
|
|
if (ret < 0)
|
|
|
|
goto unlock_err;
|
|
|
|
|
|
|
|
/* clear and unlock the entire range */
|
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
|
|
|
unlock_bits, 1, 0, &cached_state);
|
|
|
|
} else {
|
2018-05-02 12:19:32 +00:00
|
|
|
ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
|
|
|
|
start, len);
|
|
|
|
/* Can be negative only if we read from a hole */
|
|
|
|
if (ret < 0) {
|
|
|
|
ret = 0;
|
|
|
|
free_extent_map(em);
|
|
|
|
goto unlock_err;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We need to unlock only the end area that we aren't using.
|
|
|
|
* The rest is going to be unlocked by the endio routine.
|
|
|
|
*/
|
|
|
|
lockstart = start + bh_result->b_size;
|
|
|
|
if (lockstart < lockend) {
|
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
|
|
|
|
lockend, unlock_bits, 1, 0,
|
|
|
|
&cached_state);
|
|
|
|
} else {
|
|
|
|
free_extent_state(cached_state);
|
|
|
|
}
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
return 0;
|
2012-07-31 20:28:48 +00:00
|
|
|
|
|
|
|
unlock_err:
|
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
2017-10-31 15:37:52 +00:00
|
|
|
unlock_bits, 1, 0, &cached_state);
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignoring that we account extents
in BTRFS_MAX_EXTENT_SIZE units, so fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.c:btrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must have a value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 09:52:04 +00:00
|
|
|
err:
|
2015-08-28 15:40:13 +00:00
|
|
|
if (dio_data)
|
|
|
|
current->journal_info = dio_data;
|
2012-07-31 20:28:48 +00:00
|
|
|
return ret;
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
2017-08-23 06:45:59 +00:00
|
|
|
/*
 * Submit a repair bio built for a failed direct IO read.
 *
 * The bio must not be a write (BUG_ON below). It is first registered with
 * btrfs_bio_wq_end_io() using BTRFS_WQ_ENDIO_DIO_REPAIR; if that fails its
 * status is returned and the bio is not mapped. Otherwise the bio is handed
 * to btrfs_map_bio() for @mirror_num and that call's status is returned.
 */
static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
|
|
|
|
struct bio *bio,
|
|
|
|
int mirror_num)
|
2014-09-12 10:44:03 +00:00
|
|
|
{
|
2016-06-22 22:54:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2017-08-23 06:45:59 +00:00
|
|
|
blk_status_t ret;
|
2014-09-12 10:44:03 +00:00
|
|
|
|
2016-06-05 19:31:52 +00:00
|
|
|
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
|
2014-09-12 10:44:03 +00:00
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
|
2014-09-12 10:44:03 +00:00
|
|
|
if (ret)
|
2017-12-13 08:25:38 +00:00
|
|
|
return ret;
|
2014-09-12 10:44:03 +00:00
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
|
2017-12-13 08:25:38 +00:00
|
|
|
|
2014-09-12 10:44:03 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Decide whether a failed direct IO read can be retried on another mirror.
 *
 * Returns 0 when btrfs_num_copies() reports a single copy. Otherwise it
 * records @failed_mirror in failrec->failed_mirror, advances
 * failrec->this_mirror (stepping over @failed_mirror), and returns 0 if that
 * moves this_mirror past the number of copies, or 1 if this_mirror still
 * names a mirror that can be tried.
 *
 * @failed_bio is not referenced by this function.
 */
static int btrfs_check_dio_repairable(struct inode *inode,
|
|
|
|
struct bio *failed_bio,
|
|
|
|
struct io_failure_record *failrec,
|
|
|
|
int failed_mirror)
|
|
|
|
{
|
2016-09-20 14:05:02 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2014-09-12 10:44:03 +00:00
|
|
|
int num_copies;
|
|
|
|
|
2016-09-20 14:05:02 +00:00
|
|
|
num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
|
2014-09-12 10:44:03 +00:00
|
|
|
if (num_copies == 1) {
|
|
|
|
/*
|
|
|
|
* we only have a single copy of the data, so don't bother with
|
|
|
|
* all the retry and error correction code that follows. no
|
|
|
|
* matter what the error is, it is very likely to persist.
|
|
|
|
*/
|
2016-09-20 14:05:02 +00:00
|
|
|
btrfs_debug(fs_info,
|
|
|
|
"Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
|
|
|
|
num_copies, failrec->this_mirror, failed_mirror);
|
2014-09-12 10:44:03 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
failrec->failed_mirror = failed_mirror;
|
|
|
|
failrec->this_mirror++;
|
|
|
|
if (failrec->this_mirror == failed_mirror)
|
|
|
|
failrec->this_mirror++;
|
|
|
|
|
|
|
|
if (failrec->this_mirror > num_copies) {
|
2016-09-20 14:05:02 +00:00
|
|
|
btrfs_debug(fs_info,
|
|
|
|
"Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
|
|
|
|
num_copies, failrec->this_mirror, failed_mirror);
|
2014-09-12 10:44:03 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-08-23 06:45:59 +00:00
|
|
|
/*
 * Build and submit a repair read for the range [@start, @end] of a failed
 * direct IO read bio.
 *
 * Obtains the io failure record for the range via
 * btrfs_get_io_failure_record(), asks btrfs_check_dio_repairable() whether
 * another mirror can be tried, then creates a repair bio with
 * btrfs_create_repair_bio() (passing @page, @pgoff, @repair_endio and
 * @repair_arg through) and submits it to failrec->this_mirror.
 *
 * REQ_FAILFAST_DEV is added to the read flags when @failed_bio has more than
 * one segment or its first bvec is larger than the inode's sector size.
 *
 * Returns the converted errno if the failure record cannot be obtained,
 * BLK_STS_IOERR if the check reports the read as not repairable, otherwise
 * the status of submit_dio_repair_bio(). The failure record is released with
 * free_io_failure() on the not-repairable and submit-failure paths; on
 * submit failure the repair bio is put as well.
 *
 * @failed_bio must not be a write (BUG_ON below).
 */
static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
|
|
|
|
struct page *page, unsigned int pgoff,
|
|
|
|
u64 start, u64 end, int failed_mirror,
|
|
|
|
bio_end_io_t *repair_endio, void *repair_arg)
|
2014-09-12 10:44:03 +00:00
|
|
|
{
|
|
|
|
struct io_failure_record *failrec;
|
2017-05-05 15:57:15 +00:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
|
2014-09-12 10:44:03 +00:00
|
|
|
struct bio *bio;
|
|
|
|
int isector;
|
2017-06-06 17:03:49 +00:00
|
|
|
unsigned int read_mode = 0;
|
2017-05-15 22:33:27 +00:00
|
|
|
int segs;
|
2014-09-12 10:44:03 +00:00
|
|
|
int ret;
|
2017-08-23 06:45:59 +00:00
|
|
|
blk_status_t status;
|
2017-12-18 12:22:12 +00:00
|
|
|
struct bio_vec bvec;
|
2014-09-12 10:44:03 +00:00
|
|
|
|
2016-06-05 19:31:52 +00:00
|
|
|
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
|
2014-09-12 10:44:03 +00:00
|
|
|
|
|
|
|
ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
|
|
|
|
if (ret)
|
2017-08-23 06:45:59 +00:00
|
|
|
return errno_to_blk_status(ret);
|
2014-09-12 10:44:03 +00:00
|
|
|
|
|
|
|
ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
|
|
|
|
failed_mirror);
|
|
|
|
if (!ret) {
|
2017-05-05 15:57:15 +00:00
|
|
|
free_io_failure(failure_tree, io_tree, failrec);
|
2017-08-23 06:45:59 +00:00
|
|
|
return BLK_STS_IOERR;
|
2014-09-12 10:44:03 +00:00
|
|
|
}
|
|
|
|
|
2017-05-15 22:33:27 +00:00
|
|
|
segs = bio_segments(failed_bio);
|
2017-12-18 12:22:12 +00:00
|
|
|
bio_get_first_bvec(failed_bio, &bvec);
|
2017-05-15 22:33:27 +00:00
|
|
|
if (segs > 1 ||
|
2017-12-18 12:22:12 +00:00
|
|
|
(bvec.bv_len > btrfs_inode_sectorsize(inode)))
|
2016-11-01 13:40:10 +00:00
|
|
|
read_mode |= REQ_FAILFAST_DEV;
|
2014-09-12 10:44:03 +00:00
|
|
|
|
|
|
|
/* Offset of @start from the failed bio's logical start, in blocks. */
isector = start - btrfs_io_bio(failed_bio)->logical;
|
|
|
|
isector >>= inode->i_sb->s_blocksize_bits;
|
|
|
|
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
|
2016-01-21 10:25:55 +00:00
|
|
|
pgoff, isector, repair_endio, repair_arg);
|
2018-06-29 08:56:53 +00:00
|
|
|
bio->bi_opf = REQ_OP_READ | read_mode;
|
2014-09-12 10:44:03 +00:00
|
|
|
|
|
|
|
btrfs_debug(BTRFS_I(inode)->root->fs_info,
|
2017-07-13 13:32:18 +00:00
|
|
|
"repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
|
2014-09-12 10:44:03 +00:00
|
|
|
read_mode, failrec->this_mirror, failrec->in_validation);
|
|
|
|
|
2017-08-23 06:45:59 +00:00
|
|
|
status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
|
|
|
|
if (status) {
|
2017-05-05 15:57:15 +00:00
|
|
|
free_io_failure(failure_tree, io_tree, failrec);
|
2014-09-12 10:44:03 +00:00
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
2017-08-23 06:45:59 +00:00
|
|
|
return status;
|
2014-09-12 10:44:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Context shared between the submitter of a repair bio and its end io
 * handler; btrfs_retry_endio_nocsum() reads it from bio->bi_private.
 */
struct btrfs_retry_complete {
|
|
|
|
/* Signalled by the end io handler once the repair bio has finished. */
struct completion done;
|
|
|
|
/* Inode whose io_tree and io_failure_tree the end io handler uses. */
struct inode *inode;
|
|
|
|
/* Passed to clean_io_failure() by the end io handler. */
u64 start;
|
|
|
|
/* Set to 1 by the end io handler when the bio carries no error status. */
int uptodate;
|
|
|
|
};
|
|
|
|
|
2015-07-20 13:29:37 +00:00
|
|
|
/*
 * End io handler for a repair bio whose bi_private points at a
 * struct btrfs_retry_complete.
 *
 * If the bio finished with an error status, nothing is marked and the waiter
 * sees uptodate unchanged. Otherwise uptodate is set to 1 and
 * clean_io_failure() is called for each segment of the bio. In both cases
 * the completion is signalled and the bio reference is dropped.
 */
static void btrfs_retry_endio_nocsum(struct bio *bio)
|
2014-09-12 10:44:03 +00:00
|
|
|
{
|
|
|
|
struct btrfs_retry_complete *done = bio->bi_private;
|
2017-05-05 15:57:15 +00:00
|
|
|
struct inode *inode = done->inode;
|
2014-09-12 10:44:03 +00:00
|
|
|
struct bio_vec *bvec;
|
2017-05-05 15:57:15 +00:00
|
|
|
struct extent_io_tree *io_tree, *failure_tree;
|
2014-09-12 10:44:03 +00:00
|
|
|
int i;
|
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
if (bio->bi_status)
|
2014-09-12 10:44:03 +00:00
|
|
|
goto end;
|
|
|
|
|
2016-01-21 10:25:55 +00:00
|
|
|
/* The repair bio is expected to carry exactly one sector-sized bvec. */
ASSERT(bio->bi_vcnt == 1);
|
2017-05-05 15:57:15 +00:00
|
|
|
io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
failure_tree = &BTRFS_I(inode)->io_failure_tree;
|
2017-12-18 12:22:04 +00:00
|
|
|
ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
|
2016-01-21 10:25:55 +00:00
|
|
|
|
2014-09-12 10:44:03 +00:00
|
|
|
done->uptodate = 1;
|
2017-07-13 16:10:07 +00:00
|
|
|
ASSERT(!bio_flagged(bio, BIO_CLONED));
|
2014-09-12 10:44:03 +00:00
|
|
|
bio_for_each_segment_all(bvec, bio, i)
|
2017-05-05 15:57:15 +00:00
|
|
|
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
|
|
|
|
io_tree, done->start, bvec->bv_page,
|
|
|
|
btrfs_ino(BTRFS_I(inode)), 0);
|
2014-09-12 10:44:03 +00:00
|
|
|
end:
|
|
|
|
complete(&done->done);
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
2017-08-23 06:45:59 +00:00
|
|
|
static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
|
|
|
|
struct btrfs_io_bio *io_bio)
|
2010-05-23 15:00:55 +00:00
|
|
|
{
|
2016-01-21 10:25:55 +00:00
|
|
|
struct btrfs_fs_info *fs_info;
|
2017-05-15 22:33:27 +00:00
|
|
|
struct bio_vec bvec;
|
|
|
|
struct bvec_iter iter;
|
2014-09-12 10:44:03 +00:00
|
|
|
struct btrfs_retry_complete done;
|
2010-05-23 15:00:55 +00:00
|
|
|
u64 start;
|
2016-01-21 10:25:55 +00:00
|
|
|
unsigned int pgoff;
|
|
|
|
u32 sectorsize;
|
|
|
|
int nr_sectors;
|
2017-08-23 06:45:59 +00:00
|
|
|
blk_status_t ret;
|
|
|
|
blk_status_t err = BLK_STS_OK;
|
2010-05-23 15:00:55 +00:00
|
|
|
|
2016-01-21 10:25:55 +00:00
|
|
|
fs_info = BTRFS_I(inode)->root->fs_info;
|
2016-06-15 13:22:56 +00:00
|
|
|
sectorsize = fs_info->sectorsize;
|
2016-01-21 10:25:55 +00:00
|
|
|
|
2014-09-12 10:44:03 +00:00
|
|
|
start = io_bio->logical;
|
|
|
|
done.inode = inode;
|
2017-05-15 22:33:27 +00:00
|
|
|
io_bio->bio.bi_iter = io_bio->iter;
|
2014-09-12 10:44:03 +00:00
|
|
|
|
2017-05-15 22:33:27 +00:00
|
|
|
bio_for_each_segment(bvec, &io_bio->bio, iter) {
|
|
|
|
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
|
|
|
|
pgoff = bvec.bv_offset;
|
2016-01-21 10:25:55 +00:00
|
|
|
|
|
|
|
next_block_or_try_again:
|
2014-09-12 10:44:03 +00:00
|
|
|
done.uptodate = 0;
|
|
|
|
done.start = start;
|
|
|
|
init_completion(&done.done);
|
|
|
|
|
2017-05-15 22:33:27 +00:00
|
|
|
ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
|
2016-01-21 10:25:55 +00:00
|
|
|
pgoff, start, start + sectorsize - 1,
|
|
|
|
io_bio->mirror_num,
|
|
|
|
btrfs_retry_endio_nocsum, &done);
|
2017-05-16 00:20:07 +00:00
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto next;
|
|
|
|
}
|
2014-09-12 10:44:03 +00:00
|
|
|
|
2017-07-19 17:26:45 +00:00
|
|
|
wait_for_completion_io(&done.done);
|
2014-09-12 10:44:03 +00:00
|
|
|
|
|
|
|
if (!done.uptodate) {
|
|
|
|
/* We might have another mirror, so try again */
|
2016-01-21 10:25:55 +00:00
|
|
|
goto next_block_or_try_again;
|
2014-09-12 10:44:03 +00:00
|
|
|
}
|
|
|
|
|
2017-05-16 00:20:07 +00:00
|
|
|
next:
|
2016-01-21 10:25:55 +00:00
|
|
|
start += sectorsize;
|
|
|
|
|
2017-04-07 20:11:10 +00:00
|
|
|
nr_sectors--;
|
|
|
|
if (nr_sectors) {
|
2016-01-21 10:25:55 +00:00
|
|
|
pgoff += sectorsize;
|
2017-04-07 20:11:10 +00:00
|
|
|
ASSERT(pgoff < PAGE_SIZE);
|
2016-01-21 10:25:55 +00:00
|
|
|
goto next_block_or_try_again;
|
|
|
|
}
|
2014-09-12 10:44:03 +00:00
|
|
|
}
|
|
|
|
|
2017-05-16 00:20:07 +00:00
|
|
|
return err;
|
2014-09-12 10:44:03 +00:00
|
|
|
}
|
|
|
|
|
2015-07-20 13:29:37 +00:00
|
|
|
static void btrfs_retry_endio(struct bio *bio)
|
2014-09-12 10:44:03 +00:00
|
|
|
{
|
|
|
|
struct btrfs_retry_complete *done = bio->bi_private;
|
|
|
|
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
|
2017-05-05 15:57:15 +00:00
|
|
|
struct extent_io_tree *io_tree, *failure_tree;
|
|
|
|
struct inode *inode = done->inode;
|
2014-09-12 10:44:03 +00:00
|
|
|
struct bio_vec *bvec;
|
|
|
|
int uptodate;
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
if (bio->bi_status)
|
2014-09-12 10:44:03 +00:00
|
|
|
goto end;
|
|
|
|
|
|
|
|
uptodate = 1;
|
2016-01-21 10:25:55 +00:00
|
|
|
|
|
|
|
ASSERT(bio->bi_vcnt == 1);
|
2017-12-18 12:22:04 +00:00
|
|
|
ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
|
2016-01-21 10:25:55 +00:00
|
|
|
|
2017-05-05 15:57:15 +00:00
|
|
|
io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
failure_tree = &BTRFS_I(inode)->io_failure_tree;
|
|
|
|
|
2017-07-13 16:10:07 +00:00
|
|
|
ASSERT(!bio_flagged(bio, BIO_CLONED));
|
2014-09-12 10:44:03 +00:00
|
|
|
bio_for_each_segment_all(bvec, bio, i) {
|
2017-05-05 15:57:15 +00:00
|
|
|
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
|
|
|
|
bvec->bv_offset, done->start,
|
|
|
|
bvec->bv_len);
|
2014-09-12 10:44:03 +00:00
|
|
|
if (!ret)
|
2017-05-05 15:57:15 +00:00
|
|
|
clean_io_failure(BTRFS_I(inode)->root->fs_info,
|
|
|
|
failure_tree, io_tree, done->start,
|
|
|
|
bvec->bv_page,
|
|
|
|
btrfs_ino(BTRFS_I(inode)),
|
|
|
|
bvec->bv_offset);
|
2014-09-12 10:44:03 +00:00
|
|
|
else
|
|
|
|
uptodate = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
done->uptodate = uptodate;
|
|
|
|
end:
|
|
|
|
complete(&done->done);
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
/*
 * Verify, and where needed repair, every sector of a completed direct IO
 * read bio on an inode that has data checksums.
 *
 * @inode:  inode the read was issued against
 * @io_bio: the completed btrfs bio
 * @err:    completion status of the bio; only used to decide whether the
 *          data is worth checking before going straight to repair
 *
 * If the bio completed cleanly, each sector is checked with
 * __readpage_endio_check() and left alone when the check passes.  A sector
 * that fails the check - or every sector, when the bio completed with an
 * error - is re-read through dio_read_error() and waited for synchronously;
 * a retry that does not come back uptodate is issued again (another mirror
 * may still hold good data).
 *
 * Returns BLK_STS_OK, or the status of the last dio_read_error() call that
 * failed.  The incoming @err itself is not propagated.
 */
static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
				struct btrfs_io_bio *io_bio, blk_status_t err)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const bool uptodate = (err == 0);
	struct btrfs_retry_complete done;
	struct bvec_iter iter;
	struct bio_vec bvec;
	blk_status_t status;
	unsigned int pgoff;
	u64 start = io_bio->logical;
	u64 offset = 0;
	bool need_repair;
	int nr_sectors;
	int csum_pos;
	int ret;

	/* From here on err only reports failures of the repair itself. */
	err = BLK_STS_OK;
	done.inode = inode;
	/* Walk the bio from the iterator kept in the io_bio. */
	io_bio->bio.bi_iter = io_bio->iter;

	bio_for_each_segment(bvec, &io_bio->bio, iter) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
		pgoff = bvec.bv_offset;

		/* One pass of this loop per sector of the segment. */
		for (;;) {
			need_repair = true;

			if (uptodate) {
				/* offset counts bytes from the bio start. */
				csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
				ret = __readpage_endio_check(inode, io_bio,
							     csum_pos,
							     bvec.bv_page,
							     pgoff, start,
							     sectorsize);
				if (likely(!ret))
					need_repair = false;
			}

			while (need_repair) {
				done.uptodate = 0;
				done.start = start;
				init_completion(&done.done);

				status = dio_read_error(inode, &io_bio->bio,
							bvec.bv_page, pgoff,
							start,
							start + sectorsize - 1,
							io_bio->mirror_num,
							btrfs_retry_endio,
							&done);
				if (status) {
					/* Give up on this sector only. */
					err = status;
					break;
				}

				wait_for_completion_io(&done.done);

				/* We might have another mirror, so try again */
				need_repair = !done.uptodate;
			}

			offset += sectorsize;
			start += sectorsize;

			ASSERT(nr_sectors);

			nr_sectors--;
			if (!nr_sectors)
				break;

			pgoff += sectorsize;
			ASSERT(pgoff < PAGE_SIZE);
		}
	}

	return err;
}
|
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
/*
 * Post-process a completed direct IO read bio.
 *
 * Inodes with data checksums always go through __btrfs_subio_endio_read(),
 * whatever @err is.  Inodes flagged BTRFS_INODE_NODATASUM have nothing to
 * verify, so they are only sent to __btrfs_correct_data_nocsum() when the
 * bio actually failed.
 *
 * Returns the status the bio should be completed with.
 */
static blk_status_t btrfs_subio_endio_read(struct inode *inode,
		struct btrfs_io_bio *io_bio, blk_status_t err)
{
	const bool has_csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);

	if (has_csum)
		return __btrfs_subio_endio_read(inode, io_bio, err);

	if (unlikely(err))
		return __btrfs_correct_data_nocsum(inode, io_bio);

	return BLK_STS_OK;
}
|
|
|
|
|
2015-07-20 13:29:37 +00:00
|
|
|
static void btrfs_endio_direct_read(struct bio *bio)
|
2014-09-12 10:43:56 +00:00
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
struct inode *inode = dip->inode;
|
|
|
|
struct bio *dio_bio;
|
|
|
|
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
|
2017-06-03 07:38:06 +00:00
|
|
|
blk_status_t err = bio->bi_status;
|
2014-09-12 10:43:56 +00:00
|
|
|
|
2017-09-15 21:06:51 +00:00
|
|
|
if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
|
2014-09-12 10:44:03 +00:00
|
|
|
err = btrfs_subio_endio_read(inode, io_bio, err);
|
2014-09-12 10:43:56 +00:00
|
|
|
|
2010-05-23 15:00:55 +00:00
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
|
2012-03-01 13:57:19 +00:00
|
|
|
dip->logical_offset + dip->bytes - 1);
|
2013-05-17 22:30:14 +00:00
|
|
|
dio_bio = dip->dio_bio;
|
2010-05-23 15:00:55 +00:00
|
|
|
|
|
|
|
kfree(dip);
|
2011-03-22 15:05:07 +00:00
|
|
|
|
2017-09-15 21:06:51 +00:00
|
|
|
dio_bio->bi_status = err;
|
2017-06-03 07:37:58 +00:00
|
|
|
dio_end_io(dio_bio);
|
2014-09-12 10:43:54 +00:00
|
|
|
|
|
|
|
if (io_bio->end_io)
|
2017-06-03 07:38:06 +00:00
|
|
|
io_bio->end_io(io_bio, blk_status_to_errno(err));
|
2013-05-17 22:30:14 +00:00
|
|
|
bio_put(bio);
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
static void __endio_write_update_ordered(struct inode *inode,
|
|
|
|
const u64 offset, const u64 bytes,
|
|
|
|
const bool uptodate)
|
2010-05-23 15:00:55 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2010-05-23 15:00:55 +00:00
|
|
|
struct btrfs_ordered_extent *ordered = NULL;
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
struct btrfs_workqueue *wq;
|
|
|
|
btrfs_work_func_t func;
|
2015-11-24 16:23:54 +00:00
|
|
|
u64 ordered_offset = offset;
|
|
|
|
u64 ordered_bytes = bytes;
|
2017-09-01 08:59:07 +00:00
|
|
|
u64 last_offset;
|
2010-05-23 15:00:55 +00:00
|
|
|
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
|
|
|
|
wq = fs_info->endio_freespace_worker;
|
|
|
|
func = btrfs_freespace_write_helper;
|
|
|
|
} else {
|
|
|
|
wq = fs_info->endio_write_workers;
|
|
|
|
func = btrfs_endio_write_helper;
|
|
|
|
}
|
|
|
|
|
2018-04-11 08:21:17 +00:00
|
|
|
while (ordered_offset < offset + bytes) {
|
|
|
|
last_offset = ordered_offset;
|
|
|
|
if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
|
|
|
|
&ordered_offset,
|
|
|
|
ordered_bytes,
|
|
|
|
uptodate)) {
|
|
|
|
btrfs_init_work(&ordered->work, func,
|
|
|
|
finish_ordered_fn,
|
|
|
|
NULL, NULL);
|
|
|
|
btrfs_queue_work(wq, &ordered->work);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If btrfs_dec_test_ordered_pending does not find any ordered
|
|
|
|
* extent in the range, we can exit.
|
|
|
|
*/
|
|
|
|
if (ordered_offset == last_offset)
|
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* Our bio might span multiple ordered extents. In this case
|
|
|
|
* we keep goin until we have accounted the whole dio.
|
|
|
|
*/
|
|
|
|
if (ordered_offset < offset + bytes) {
|
|
|
|
ordered_bytes = offset + bytes - ordered_offset;
|
|
|
|
ordered = NULL;
|
|
|
|
}
|
2010-11-29 00:56:33 +00:00
|
|
|
}
|
2015-11-24 16:23:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_endio_direct_write(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
struct bio *dio_bio = dip->dio_bio;
|
|
|
|
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
__endio_write_update_ordered(dip->inode, dip->logical_offset,
|
2017-06-03 07:38:06 +00:00
|
|
|
dip->bytes, !bio->bi_status);
|
2010-05-23 15:00:55 +00:00
|
|
|
|
|
|
|
kfree(dip);
|
2011-03-22 15:05:07 +00:00
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
dio_bio->bi_status = bio->bi_status;
|
2017-06-03 07:37:58 +00:00
|
|
|
dio_end_io(dio_bio);
|
2013-05-17 22:30:14 +00:00
|
|
|
bio_put(bio);
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
2018-03-08 13:35:48 +00:00
|
|
|
/*
 * Start hook passed to btrfs_wq_submit_bio() for asynchronously submitted
 * direct IO writes: compute the data checksums for @bio.
 *
 * @private_data: the inode being written (struct inode *)
 * @bio:          the write bio to checksum
 * @offset:       file offset the bio starts at
 *
 * Returns the status of btrfs_csum_one_bio().  A failure there (-ENOMEM
 * according to the comment this code used to carry) is reported to the
 * caller instead of taking the machine down with BUG_ON(), which matches how
 * the synchronous write path in btrfs_submit_dio_bio() treats the very same
 * call.
 *
 * NOTE(review): this relies on the async submit machinery honouring a
 * non-zero return from its start hook and failing the bio - confirm against
 * the workqueue start/done helpers in disk-io.c.
 */
static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
				    struct bio *bio, u64 offset)
{
	struct inode *inode = private_data;

	return btrfs_csum_one_bio(inode, bio, offset, 1);
}
|
|
|
|
|
2015-07-20 13:29:37 +00:00
|
|
|
static void btrfs_end_dio_bio(struct bio *bio)
|
2010-11-22 03:04:43 +00:00
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
2017-06-03 07:38:06 +00:00
|
|
|
blk_status_t err = bio->bi_status;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2014-09-12 10:44:03 +00:00
|
|
|
if (err)
|
|
|
|
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
|
2016-06-05 19:32:21 +00:00
|
|
|
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
|
|
|
|
bio->bi_opf,
|
2014-09-12 10:44:03 +00:00
|
|
|
(unsigned long long)bio->bi_iter.bi_sector,
|
|
|
|
bio->bi_iter.bi_size, err);
|
|
|
|
|
|
|
|
if (dip->subio_endio)
|
|
|
|
err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
|
2014-09-12 10:43:56 +00:00
|
|
|
|
|
|
|
if (err) {
|
2010-11-22 03:04:43 +00:00
|
|
|
/*
|
2018-02-14 08:53:36 +00:00
|
|
|
* We want to perceive the errors flag being set before
|
|
|
|
* decrementing the reference count. We don't need a barrier
|
|
|
|
* since atomic operations with a return value are fully
|
|
|
|
* ordered as per atomic_t.txt
|
2010-11-22 03:04:43 +00:00
|
|
|
*/
|
2018-02-14 08:53:36 +00:00
|
|
|
dip->errors = 1;
|
2010-11-22 03:04:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* if there are more bios still pending for this dio, just exit */
|
|
|
|
if (!atomic_dec_and_test(&dip->pending_bios))
|
|
|
|
goto out;
|
|
|
|
|
2013-05-17 22:30:14 +00:00
|
|
|
if (dip->errors) {
|
2010-11-22 03:04:43 +00:00
|
|
|
bio_io_error(dip->orig_bio);
|
2013-05-17 22:30:14 +00:00
|
|
|
} else {
|
2017-10-14 00:35:56 +00:00
|
|
|
dip->dio_bio->bi_status = BLK_STS_OK;
|
2015-07-20 13:29:37 +00:00
|
|
|
bio_endio(dip->orig_bio);
|
2010-11-22 03:04:43 +00:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
|
2014-09-12 10:43:56 +00:00
|
|
|
struct btrfs_dio_private *dip,
|
|
|
|
struct bio *bio,
|
|
|
|
u64 file_offset)
|
|
|
|
{
|
|
|
|
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
|
|
|
|
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
|
2017-06-03 07:38:06 +00:00
|
|
|
blk_status_t ret;
|
2014-09-12 10:43:56 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We load all the csum data we need when we submit
|
|
|
|
* the first bio to reduce the csum tree search and
|
|
|
|
* contention.
|
|
|
|
*/
|
|
|
|
if (dip->logical_offset == file_offset) {
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
|
2014-09-12 10:43:56 +00:00
|
|
|
file_offset);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bio == dip->orig_bio)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
file_offset -= dip->logical_offset;
|
|
|
|
file_offset >>= inode->i_sb->s_blocksize_bits;
|
|
|
|
io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-03-08 13:35:48 +00:00
|
|
|
/*
 * Submit one bio (the original or a partial clone) of a direct IO request.
 *
 * @bio:          bio to submit; bi_private must be the btrfs_dio_private
 * @inode:        inode the IO targets
 * @file_offset:  file offset @bio starts at
 * @async_submit: non-zero if checksumming may be pushed to a worker
 *
 * Reads first get their completion routed through btrfs_bio_wq_end_io().
 * Unless the inode is flagged BTRFS_INODE_NODATASUM, writes are then
 * checksummed - inline, or via btrfs_wq_submit_bio() when async submission
 * is allowed - and reads get their expected checksums looked up and bound.
 * Everything except the async write case ends in btrfs_map_bio().
 *
 * Returns 0 on success or the status of the step that failed.
 */
static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
		struct inode *inode, u64 file_offset, int async_submit)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_dio_private *dip = bio->bi_private;
	const bool is_write = (bio_op(bio) == REQ_OP_WRITE);
	blk_status_t ret;

	/* Check btrfs_submit_bio_hook() for rules about async submit. */
	async_submit = async_submit &&
		       !atomic_read(&BTRFS_I(inode)->sync_writers);

	if (!is_write) {
		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
		if (ret)
			return ret;
	}

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		/* The worker path takes over the bio, success or failure. */
		if (is_write && async_submit)
			return btrfs_wq_submit_bio(fs_info, bio, 0, 0,
					file_offset, inode,
					btrfs_submit_bio_start_direct_io);

		if (is_write) {
			/*
			 * If we aren't doing async submit, calculate the
			 * csum of the bio now.
			 */
			ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
		} else {
			ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
							     file_offset);
		}
		if (ret)
			return ret;
	}

	return btrfs_map_bio(fs_info, bio, 0, 0);
}
|
|
|
|
|
2017-08-03 12:44:58 +00:00
|
|
|
static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
|
2010-11-22 03:04:43 +00:00
|
|
|
{
|
|
|
|
struct inode *inode = dip->inode;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2010-11-22 03:04:43 +00:00
|
|
|
struct bio *bio;
|
|
|
|
struct bio *orig_bio = dip->orig_bio;
|
2013-10-11 22:44:27 +00:00
|
|
|
u64 start_sector = orig_bio->bi_iter.bi_sector;
|
2010-11-22 03:04:43 +00:00
|
|
|
u64 file_offset = dip->logical_offset;
|
|
|
|
u64 map_length;
|
2011-04-06 18:41:34 +00:00
|
|
|
int async_submit = 0;
|
2017-05-16 16:51:39 +00:00
|
|
|
u64 submit_len;
|
|
|
|
int clone_offset = 0;
|
|
|
|
int clone_len;
|
2016-01-21 10:26:00 +00:00
|
|
|
int ret;
|
2017-08-23 06:45:59 +00:00
|
|
|
blk_status_t status;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2013-10-11 22:44:27 +00:00
|
|
|
map_length = orig_bio->bi_iter.bi_size;
|
2017-05-16 16:51:39 +00:00
|
|
|
submit_len = map_length;
|
2016-06-22 22:54:23 +00:00
|
|
|
ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
|
|
|
|
&map_length, NULL, 0);
|
2014-06-17 10:58:59 +00:00
|
|
|
if (ret)
|
2010-11-22 03:04:43 +00:00
|
|
|
return -EIO;
|
2013-07-25 11:22:34 +00:00
|
|
|
|
2017-05-16 16:51:39 +00:00
|
|
|
if (map_length >= submit_len) {
|
2011-04-06 18:25:44 +00:00
|
|
|
bio = orig_bio;
|
2014-09-12 10:43:56 +00:00
|
|
|
dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
|
2011-04-06 18:25:44 +00:00
|
|
|
goto submit;
|
|
|
|
}
|
|
|
|
|
2013-01-29 23:40:14 +00:00
|
|
|
/* async crcs make it difficult to collect full stripe writes. */
|
2017-05-17 15:38:35 +00:00
|
|
|
if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
|
2013-01-29 23:40:14 +00:00
|
|
|
async_submit = 0;
|
|
|
|
else
|
|
|
|
async_submit = 1;
|
|
|
|
|
2017-05-16 16:51:39 +00:00
|
|
|
/* bio split */
|
|
|
|
ASSERT(map_length <= INT_MAX);
|
2011-04-06 18:25:44 +00:00
|
|
|
atomic_inc(&dip->pending_bios);
|
2017-05-18 13:33:29 +00:00
|
|
|
do {
|
2017-05-16 16:51:39 +00:00
|
|
|
clone_len = min_t(int, submit_len, map_length);
|
2011-04-06 18:25:44 +00:00
|
|
|
|
2017-05-16 16:51:39 +00:00
|
|
|
/*
|
|
|
|
* This will never fail as it's passing GPF_NOFS and
|
|
|
|
* the allocation is backed by btrfs_bioset.
|
|
|
|
*/
|
2017-05-16 17:57:14 +00:00
|
|
|
bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
|
2017-05-16 16:51:39 +00:00
|
|
|
clone_len);
|
|
|
|
bio->bi_private = dip;
|
|
|
|
bio->bi_end_io = btrfs_end_dio_bio;
|
|
|
|
btrfs_io_bio(bio)->logical = file_offset;
|
|
|
|
|
|
|
|
ASSERT(submit_len >= clone_len);
|
|
|
|
submit_len -= clone_len;
|
|
|
|
if (submit_len == 0)
|
|
|
|
break;
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2017-05-16 16:51:39 +00:00
|
|
|
/*
|
|
|
|
* Increase the count before we submit the bio so we know
|
|
|
|
* the end IO handler won't happen before we increase the
|
|
|
|
* count. Otherwise, the dip might get freed before we're
|
|
|
|
* done setting it up.
|
|
|
|
*/
|
|
|
|
atomic_inc(&dip->pending_bios);
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2018-03-08 13:35:48 +00:00
|
|
|
status = btrfs_submit_dio_bio(bio, inode, file_offset,
|
2017-08-23 06:45:59 +00:00
|
|
|
async_submit);
|
|
|
|
if (status) {
|
2017-05-16 16:51:39 +00:00
|
|
|
bio_put(bio);
|
|
|
|
atomic_dec(&dip->pending_bios);
|
|
|
|
goto out_err;
|
|
|
|
}
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2017-05-16 16:51:39 +00:00
|
|
|
clone_offset += clone_len;
|
|
|
|
start_sector += clone_len >> 9;
|
|
|
|
file_offset += clone_len;
|
2016-01-21 10:26:00 +00:00
|
|
|
|
2017-05-16 16:51:39 +00:00
|
|
|
map_length = submit_len;
|
|
|
|
ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
|
|
|
|
start_sector << 9, &map_length, NULL, 0);
|
|
|
|
if (ret)
|
|
|
|
goto out_err;
|
2017-05-18 13:33:29 +00:00
|
|
|
} while (submit_len > 0);
|
2010-11-22 03:04:43 +00:00
|
|
|
|
2011-04-06 18:25:44 +00:00
|
|
|
submit:
|
2018-03-08 13:35:48 +00:00
|
|
|
status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
|
2017-08-23 06:45:59 +00:00
|
|
|
if (!status)
|
2010-11-22 03:04:43 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
bio_put(bio);
|
|
|
|
out_err:
|
|
|
|
dip->errors = 1;
|
|
|
|
/*
|
2018-02-14 08:53:36 +00:00
|
|
|
* Before atomic variable goto zero, we must make sure dip->errors is
|
|
|
|
* perceived to be set. This ordering is ensured by the fact that an
|
|
|
|
* atomic operations with a return value are fully ordered as per
|
|
|
|
* atomic_t.txt
|
2010-11-22 03:04:43 +00:00
|
|
|
*/
|
|
|
|
if (atomic_dec_and_test(&dip->pending_bios))
|
|
|
|
bio_io_error(dip->orig_bio);
|
|
|
|
|
|
|
|
/* bio_end_io() will handle error, so we needn't return it */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-06-05 19:31:50 +00:00
|
|
|
static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
|
|
|
|
loff_t file_offset)
|
2010-05-23 15:00:55 +00:00
|
|
|
{
|
2015-07-01 11:13:10 +00:00
|
|
|
struct btrfs_dio_private *dip = NULL;
|
2017-04-17 22:00:28 +00:00
|
|
|
struct bio *bio = NULL;
|
|
|
|
struct btrfs_io_bio *io_bio;
|
2016-06-05 19:31:50 +00:00
|
|
|
bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
|
2010-05-23 15:00:55 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
2017-06-02 15:48:13 +00:00
|
|
|
bio = btrfs_bio_clone(dio_bio);
|
2013-05-17 22:30:14 +00:00
|
|
|
|
2014-09-12 10:43:56 +00:00
|
|
|
dip = kzalloc(sizeof(*dip), GFP_NOFS);
|
2010-05-23 15:00:55 +00:00
|
|
|
if (!dip) {
|
|
|
|
ret = -ENOMEM;
|
2015-07-01 11:13:10 +00:00
|
|
|
goto free_ordered;
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
2013-05-17 22:30:14 +00:00
|
|
|
dip->private = dio_bio->bi_private;
|
2010-05-23 15:00:55 +00:00
|
|
|
dip->inode = inode;
|
|
|
|
dip->logical_offset = file_offset;
|
2013-10-11 22:44:27 +00:00
|
|
|
dip->bytes = dio_bio->bi_iter.bi_size;
|
|
|
|
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
|
2017-04-17 22:00:28 +00:00
|
|
|
bio->bi_private = dip;
|
|
|
|
dip->orig_bio = bio;
|
2013-05-17 22:30:14 +00:00
|
|
|
dip->dio_bio = dio_bio;
|
2010-11-22 03:04:43 +00:00
|
|
|
atomic_set(&dip->pending_bios, 0);
|
2017-04-17 22:00:28 +00:00
|
|
|
io_bio = btrfs_io_bio(bio);
|
|
|
|
io_bio->logical = file_offset;
|
2010-05-23 15:00:55 +00:00
|
|
|
|
2014-09-12 10:43:56 +00:00
|
|
|
if (write) {
|
2017-04-17 22:00:28 +00:00
|
|
|
bio->bi_end_io = btrfs_endio_direct_write;
|
2014-09-12 10:43:56 +00:00
|
|
|
} else {
|
2017-04-17 22:00:28 +00:00
|
|
|
bio->bi_end_io = btrfs_endio_direct_read;
|
2014-09-12 10:43:56 +00:00
|
|
|
dip->subio_endio = btrfs_subio_endio_read;
|
|
|
|
}
|
2010-05-23 15:00:55 +00:00
|
|
|
|
2015-12-08 19:23:20 +00:00
|
|
|
/*
|
|
|
|
* Reset the range for unsubmitted ordered extents (to a 0 length range)
|
|
|
|
* even if we fail to submit a bio, because in such case we do the
|
|
|
|
* corresponding error handling below and it must not be done a second
|
|
|
|
* time by btrfs_direct_IO().
|
|
|
|
*/
|
|
|
|
if (write) {
|
|
|
|
struct btrfs_dio_data *dio_data = current->journal_info;
|
|
|
|
|
|
|
|
dio_data->unsubmitted_oe_range_end = dip->logical_offset +
|
|
|
|
dip->bytes;
|
|
|
|
dio_data->unsubmitted_oe_range_start =
|
|
|
|
dio_data->unsubmitted_oe_range_end;
|
|
|
|
}
|
|
|
|
|
2017-08-03 12:44:58 +00:00
|
|
|
ret = btrfs_submit_direct_hook(dip);
|
2010-11-22 03:04:43 +00:00
|
|
|
if (!ret)
|
2010-05-25 13:48:28 +00:00
|
|
|
return;
|
2013-05-17 22:30:14 +00:00
|
|
|
|
2017-04-17 22:00:28 +00:00
|
|
|
if (io_bio->end_io)
|
|
|
|
io_bio->end_io(io_bio, ret);
|
2013-05-17 22:30:14 +00:00
|
|
|
|
2010-05-23 15:00:55 +00:00
|
|
|
free_ordered:
|
|
|
|
/*
|
2015-07-01 11:13:10 +00:00
|
|
|
* If we arrived here it means either we failed to submit the dip
|
|
|
|
* or we either failed to clone the dio_bio or failed to allocate the
|
|
|
|
* dip. If we cloned the dio_bio and allocated the dip, we can just
|
|
|
|
* call bio_endio against our io_bio so that we get proper resource
|
|
|
|
* cleanup if we fail to submit the dip, otherwise, we must do the
|
|
|
|
* same as btrfs_endio_direct_[write|read] because we can't call these
|
|
|
|
* callbacks - they require an allocated dip and a clone of dio_bio.
|
2010-05-23 15:00:55 +00:00
|
|
|
*/
|
2017-04-17 22:00:28 +00:00
|
|
|
if (bio && dip) {
|
2017-06-02 08:08:50 +00:00
|
|
|
bio_io_error(bio);
|
2015-07-01 11:13:10 +00:00
|
|
|
/*
|
2017-04-17 22:00:28 +00:00
|
|
|
* The end io callbacks free our dip, do the final put on bio
|
2015-07-01 11:13:10 +00:00
|
|
|
* and all the cleanup and final put for dio_bio (through
|
|
|
|
* dio_end_io()).
|
|
|
|
*/
|
|
|
|
dip = NULL;
|
2017-04-17 22:00:28 +00:00
|
|
|
bio = NULL;
|
2015-07-01 11:13:10 +00:00
|
|
|
} else {
|
2015-11-24 16:23:54 +00:00
|
|
|
if (write)
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
__endio_write_update_ordered(inode,
|
2015-11-24 16:23:54 +00:00
|
|
|
file_offset,
|
|
|
|
dio_bio->bi_iter.bi_size,
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
false);
|
2015-11-24 16:23:54 +00:00
|
|
|
else
|
2015-07-01 11:13:10 +00:00
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
|
|
|
|
file_offset + dio_bio->bi_iter.bi_size - 1);
|
2015-11-24 16:23:54 +00:00
|
|
|
|
2017-06-03 07:38:06 +00:00
|
|
|
dio_bio->bi_status = BLK_STS_IOERR;
|
2015-07-01 11:13:10 +00:00
|
|
|
/*
|
|
|
|
* Releases and cleans up our dio_bio, no need to bio_put()
|
|
|
|
* nor bio_endio()/bio_io_error() against dio_bio.
|
|
|
|
*/
|
2017-06-03 07:37:58 +00:00
|
|
|
dio_end_io(dio_bio);
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
2017-04-17 22:00:28 +00:00
|
|
|
if (bio)
|
|
|
|
bio_put(bio);
|
2015-07-01 11:13:10 +00:00
|
|
|
kfree(dip);
|
2010-05-23 15:00:55 +00:00
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
/*
 * Validate a direct I/O request before it is handed to the generic DIO code.
 *
 * @fs_info: filesystem info; only ->sectorsize is read here
 * @iter:    the iov_iter describing the user buffers
 * @offset:  file offset of the request
 *
 * Returns 0 when the request passes every check, -EINVAL otherwise.
 */
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	/* NOTE(review): mask form presumes sectorsize is a power of two. */
	unsigned int sector_mask = fs_info->sectorsize - 1;
	int outer;
	int inner;

	/* Both the file offset and the buffers must be sector aligned. */
	if ((offset & sector_mask) ||
	    (iov_iter_alignment(iter) & sector_mask))
		return -EINVAL;

	/* Only reads backed by a plain iovec need the duplicate check. */
	if (!(iov_iter_rw(iter) == READ && iter_is_iovec(iter)))
		return 0;

	/*
	 * Reject an iovec in which two segments share the same iov_base:
	 * reading into the same buffer twice would give csum errors when
	 * reading back.
	 */
	for (outer = 0; outer < iter->nr_segs; outer++) {
		const void *base = iter->iov[outer].iov_base;

		for (inner = outer + 1; inner < iter->nr_segs; inner++) {
			if (base == iter->iov[inner].iov_base)
				return -EINVAL;
		}
	}

	return 0;
}
|
2012-07-31 20:28:48 +00:00
|
|
|
|
2016-04-07 15:51:58 +00:00
|
|
|
static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
2008-04-10 14:23:21 +00:00
|
|
|
{
|
2010-05-23 15:00:55 +00:00
|
|
|
struct file *file = iocb->ki_filp;
|
|
|
|
struct inode *inode = file->f_mapping->host;
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2015-08-28 15:40:13 +00:00
|
|
|
struct btrfs_dio_data dio_data = { 0 };
|
2017-02-27 07:10:38 +00:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2016-04-07 15:51:58 +00:00
|
|
|
loff_t offset = iocb->ki_pos;
|
2013-02-07 10:12:07 +00:00
|
|
|
size_t count = 0;
|
2013-02-08 07:01:08 +00:00
|
|
|
int flags = 0;
|
2013-02-08 07:04:11 +00:00
|
|
|
bool wakeup = true;
|
|
|
|
bool relock = false;
|
2013-02-07 10:12:07 +00:00
|
|
|
ssize_t ret;
|
2010-05-23 15:00:55 +00:00
|
|
|
|
2017-08-21 09:43:46 +00:00
|
|
|
if (check_direct_IO(fs_info, iter, offset))
|
2010-05-27 01:33:37 +00:00
|
|
|
return 0;
|
2010-05-26 14:59:53 +00:00
|
|
|
|
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.
For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:
clat percentiles (usec):
| 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34],
| 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35],
| 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80],
| 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155],
| 99.99th=[ 165]
After:
clat percentiles (usec):
| 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149],
| 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171],
| 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270],
| 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422],
| 99.99th=[ 438]
In other setups, Robert Elliott reported seeing good performance
improvements:
https://lkml.org/lkml/2015/4/3/557
The more applications accessing the device, the worse it gets.
Add a new direct-io flags, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-04-15 23:05:48 +00:00
|
|
|
inode_dio_begin(inode);
|
2013-02-08 07:04:11 +00:00
|
|
|
|
2013-07-02 14:38:02 +00:00
|
|
|
/*
|
Btrfs: just do dirty page flush for the inode with compression before direct IO
As the comment in the btrfs_direct_IO says, only the compressed pages need be
flush again to make sure they are on the disk, but the common pages needn't,
so we add a if statement to check if the inode has compressed pages or not,
if no, skip the flush.
And in order to prevent the write ranges from intersecting, we need wait for
the running ordered extents. But the current code waits for them twice, one
is done before the direct IO starts (in btrfs_wait_ordered_range()), the other
is before we get the blocks, it is unnecessary. because we can do the direct
IO without holding i_mutex, it means that the intersected ordered extents may
happen during the direct IO, the first wait can not avoid this problem. So we
use filemap_fdatawrite_range() instead of btrfs_wait_ordered_range() to remove
the first wait.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-03-06 05:54:57 +00:00
|
|
|
* The generic stuff only does filemap_write_and_wait_range, which
|
|
|
|
* isn't enough if we've written compressed pages to this area, so
|
|
|
|
* we need to flush the dirty pages again to make absolutely sure
|
|
|
|
* that any outstanding dirty pages are on disk.
|
2013-07-02 14:38:02 +00:00
|
|
|
*/
|
2014-03-05 03:38:00 +00:00
|
|
|
count = iov_iter_count(iter);
|
Btrfs: just do dirty page flush for the inode with compression before direct IO
As the comment in the btrfs_direct_IO says, only the compressed pages need be
flush again to make sure they are on the disk, but the common pages needn't,
so we add a if statement to check if the inode has compressed pages or not,
if no, skip the flush.
And in order to prevent the write ranges from intersecting, we need wait for
the running ordered extents. But the current code waits for them twice, one
is done before the direct IO starts (in btrfs_wait_ordered_range()), the other
is before we get the blocks, it is unnecessary. because we can do the direct
IO without holding i_mutex, it means that the intersected ordered extents may
happen during the direct IO, the first wait can not avoid this problem. So we
use filemap_fdatawrite_range() instead of btrfs_wait_ordered_range() to remove
the first wait.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-03-06 05:54:57 +00:00
|
|
|
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2014-07-17 03:44:13 +00:00
|
|
|
filemap_fdatawrite_range(inode->i_mapping, offset,
|
|
|
|
offset + count - 1);
|
2013-07-02 14:38:02 +00:00
|
|
|
|
2015-03-16 11:33:52 +00:00
|
|
|
if (iov_iter_rw(iter) == WRITE) {
|
2013-02-08 07:04:11 +00:00
|
|
|
/*
|
|
|
|
* If the write DIO is beyond the EOF, we need update
|
|
|
|
* the isize, but it is protected by i_mutex. So we can
|
|
|
|
* not unlock the i_mutex at this case.
|
|
|
|
*/
|
|
|
|
if (offset + count <= inode->i_size) {
|
2016-12-15 06:36:05 +00:00
|
|
|
dio_data.overwrite = 1;
|
2016-01-22 20:40:57 +00:00
|
|
|
inode_unlock(inode);
|
2013-02-08 07:04:11 +00:00
|
|
|
relock = true;
|
2017-06-20 12:05:49 +00:00
|
|
|
} else if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto out;
|
2013-02-08 07:04:11 +00:00
|
|
|
}
|
2017-02-27 07:10:38 +00:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
|
|
|
|
offset, count);
|
2013-02-07 10:12:07 +00:00
|
|
|
if (ret)
|
2013-02-08 07:04:11 +00:00
|
|
|
goto out;
|
2015-03-17 14:52:28 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to know how many extents we reserved so that we can
|
|
|
|
* do the accounting properly if we go over the number we
|
|
|
|
* originally calculated. Abuse current->journal_info for this.
|
|
|
|
*/
|
2016-06-15 13:22:56 +00:00
|
|
|
dio_data.reserve = round_up(count,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize);
|
2015-12-08 19:23:20 +00:00
|
|
|
dio_data.unsubmitted_oe_range_start = (u64)offset;
|
|
|
|
dio_data.unsubmitted_oe_range_end = (u64)offset;
|
2015-08-28 15:40:13 +00:00
|
|
|
current->journal_info = &dio_data;
|
2016-12-23 09:30:18 +00:00
|
|
|
down_read(&BTRFS_I(inode)->dio_sem);
|
2014-09-29 23:33:33 +00:00
|
|
|
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
|
|
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.
For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:
clat percentiles (usec):
| 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34],
| 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35],
| 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80],
| 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155],
| 99.99th=[ 165]
After:
clat percentiles (usec):
| 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149],
| 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171],
| 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270],
| 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422],
| 99.99th=[ 438]
In other setups, Robert Elliott reported seeing good performance
improvements:
https://lkml.org/lkml/2015/4/3/557
The more applications accessing the device, the worse it gets.
Add a new direct-io flags, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-04-15 23:05:48 +00:00
|
|
|
inode_dio_end(inode);
|
2013-02-08 07:04:11 +00:00
|
|
|
flags = DIO_LOCKING | DIO_SKIP_HOLES;
|
|
|
|
wakeup = false;
|
2013-02-07 10:12:07 +00:00
|
|
|
}
|
|
|
|
|
2015-03-16 11:33:50 +00:00
|
|
|
ret = __blockdev_direct_IO(iocb, inode,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->fs_devices->latest_bdev,
|
2016-04-07 15:51:58 +00:00
|
|
|
iter, btrfs_get_blocks_direct, NULL,
|
2015-03-16 11:33:50 +00:00
|
|
|
btrfs_submit_direct, flags);
|
2015-03-16 11:33:52 +00:00
|
|
|
if (iov_iter_rw(iter) == WRITE) {
|
2016-12-23 09:30:18 +00:00
|
|
|
up_read(&BTRFS_I(inode)->dio_sem);
|
2015-03-17 14:52:28 +00:00
|
|
|
current->journal_info = NULL;
|
2015-06-17 08:59:58 +00:00
|
|
|
if (ret < 0 && ret != -EIOCBQUEUED) {
|
2015-08-28 15:40:13 +00:00
|
|
|
if (dio_data.reserve)
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transcation meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
offset, dio_data.reserve, true);
|
2015-12-08 19:23:20 +00:00
|
|
|
/*
|
|
|
|
* On error we might have left some ordered extents
|
|
|
|
* without submitting corresponding bios for them, so
|
|
|
|
* cleanup them up to avoid other tasks getting them
|
|
|
|
* and waiting for them to complete forever.
|
|
|
|
*/
|
|
|
|
if (dio_data.unsubmitted_oe_range_start <
|
|
|
|
dio_data.unsubmitted_oe_range_end)
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
__endio_write_update_ordered(inode,
|
2015-12-08 19:23:20 +00:00
|
|
|
dio_data.unsubmitted_oe_range_start,
|
|
|
|
dio_data.unsubmitted_oe_range_end -
|
|
|
|
dio_data.unsubmitted_oe_range_start,
|
btrfs: Handle delalloc error correctly to avoid ordered extent hang
[BUG]
If run_delalloc_range() returns error and there is already some ordered
extents created, btrfs will be hanged with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
[CAUSE]
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>| |<---------- cleanup range --------->|
||
\_=> First page handled by end_extent_writepage() in __extent_writepage()
The problem is caused by error handler of run_delalloc_range(), which
doesn't handle any created ordered extents, leaving them waiting on
btrfs_finish_ordered_io() to finish.
However after run_delalloc_range() returns error, __extent_writepage()
won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered
except the first page, and btrfs_finish_ordered_io() won't be triggered
for created ordered extents either.
So OE 2~n will hang forever, and if OE 1 is larger than one page, it
will also hang.
[FIX]
Introduce btrfs_cleanup_ordered_extents() function to cleanup created
ordered extents and finish them manually.
The function is based on existing
btrfs_endio_direct_write_update_ordered() function, and modify it to
act just like btrfs_writepage_endio_hook() but handles specified range
other than one page.
After fix, delalloc error will be handled like:
|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<>|<-------- ----------->|<------ old error handler --------->|
|| ||
|| \_=> Cleaned up by cleanup_ordered_extents()
\_=> First page handled by end_extent_writepage() in __extent_writepage()
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-03-08 02:25:52 +00:00
|
|
|
false);
|
2015-06-17 08:59:58 +00:00
|
|
|
} else if (ret >= 0 && (size_t)ret < count)
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transcation meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
offset, count - (size_t)ret, true);
|
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
|
2013-02-07 10:12:07 +00:00
|
|
|
}
|
2013-02-08 07:04:11 +00:00
|
|
|
out:
|
2013-02-08 07:01:08 +00:00
|
|
|
if (wakeup)
|
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.
For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:
clat percentiles (usec):
| 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34],
| 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35],
| 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80],
| 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155],
| 99.99th=[ 165]
After:
clat percentiles (usec):
| 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149],
| 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171],
| 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270],
| 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422],
| 99.99th=[ 438]
In other setups, Robert Elliott reported seeing good performance
improvements:
https://lkml.org/lkml/2015/4/3/557
The more applications accessing the device, the worse it gets.
Add a new direct-io flag, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-04-15 23:05:48 +00:00
|
|
|
inode_dio_end(inode);
|
2013-02-08 07:04:11 +00:00
|
|
|
if (relock)
|
2016-01-22 20:40:57 +00:00
|
|
|
inode_lock(inode);
|
2013-02-07 10:12:07 +00:00
|
|
|
|
2017-02-27 07:10:38 +00:00
|
|
|
extent_changeset_free(data_reserved);
|
2013-02-07 10:12:07 +00:00
|
|
|
return ret;
|
2008-04-10 14:23:21 +00:00
|
|
|
}
|
|
|
|
|
2012-11-29 05:08:26 +00:00
|
|
|
#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
|
|
|
|
|
2009-01-21 19:39:14 +00:00
|
|
|
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
|
|
|
__u64 start, __u64 len)
|
|
|
|
{
|
2012-11-29 05:08:26 +00:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2017-06-23 02:09:57 +00:00
|
|
|
return extent_fiemap(inode, fieinfo, start, len);
|
2009-01-21 19:39:14 +00:00
|
|
|
}
|
|
|
|
|
2007-08-27 20:49:44 +00:00
|
|
|
int btrfs_readpage(struct file *file, struct page *page)
|
2007-06-15 17:50:00 +00:00
|
|
|
{
|
2008-01-24 21:13:08 +00:00
|
|
|
struct extent_io_tree *tree;
|
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
2011-06-13 18:02:58 +00:00
|
|
|
return extent_read_full_page(tree, page, btrfs_get_extent, 0);
|
2007-06-15 17:50:00 +00:00
|
|
|
}
|
2007-12-21 21:27:21 +00:00
|
|
|
|
2007-08-27 20:49:44 +00:00
|
|
|
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2015-10-22 19:05:09 +00:00
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
int ret;
|
2007-08-27 20:49:44 +00:00
|
|
|
|
|
|
|
if (current->flags & PF_MEMALLOC) {
|
|
|
|
redirty_page_for_writepage(wbc, page);
|
|
|
|
unlock_page(page);
|
|
|
|
return 0;
|
|
|
|
}
|
2015-10-22 19:05:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are under memory pressure we will call this directly from the
|
|
|
|
* VM, we need to make sure we have the inode referenced for the ordered
|
|
|
|
* extent. If not just return like we didn't do anything.
|
|
|
|
*/
|
|
|
|
if (!igrab(inode)) {
|
|
|
|
redirty_page_for_writepage(wbc, page);
|
|
|
|
return AOP_WRITEPAGE_ACTIVATE;
|
|
|
|
}
|
2017-12-08 13:55:59 +00:00
|
|
|
ret = extent_write_full_page(page, wbc);
|
2015-10-22 19:05:09 +00:00
|
|
|
btrfs_add_delayed_iput(inode);
|
|
|
|
return ret;
|
2007-06-15 17:50:00 +00:00
|
|
|
}
|
|
|
|
|
2013-04-25 20:41:01 +00:00
|
|
|
/*
 * Multi-page writeback entry point: forwards @mapping and @wbc to
 * extent_writepages() and returns its result unchanged.
 */
static int btrfs_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	int status = extent_writepages(mapping, wbc);

	return status;
}
|
|
|
|
|
2007-11-08 15:59:22 +00:00
|
|
|
/*
 * Multi-page read entry point: forwards @mapping, the @pages list and
 * @nr_pages to extent_readpages() and returns its result unchanged.
 * @file is not used here.
 */
static int btrfs_readpages(struct file *file, struct address_space *mapping,
			   struct list_head *pages, unsigned nr_pages)
{
	int status = extent_readpages(mapping, pages, nr_pages);

	return status;
}
|
2018-04-19 07:46:36 +00:00
|
|
|
|
2008-07-17 16:53:50 +00:00
|
|
|
/*
 * Try to release the extent mapping of @page and, on success, detach the
 * private state from the page.
 *
 * Returns the value of try_release_extent_mapping().  Only when that value is
 * exactly 1 is the page touched: PagePrivate is cleared, the private word is
 * reset to 0 and one page reference is dropped with put_page().
 */
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	int released = try_release_extent_mapping(page, gfp_flags);

	if (released != 1)
		return released;

	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);

	return released;
}
|
|
|
|
|
2008-07-17 16:53:50 +00:00
|
|
|
/*
 * Release @page unless it is under writeback or dirty.
 *
 * Returns 0 without touching the page when either PageWriteback() or
 * PageDirty() is true; otherwise returns the result of
 * __btrfs_releasepage().
 */
static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (!PageWriteback(page) && !PageDirty(page))
		return __btrfs_releasepage(page, gfp_flags);

	return 0;
}
|
|
|
|
|
2013-05-22 03:17:23 +00:00
|
|
|
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
|
|
|
|
unsigned int length)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2012-05-02 18:00:54 +00:00
|
|
|
struct inode *inode = page->mapping->host;
|
2008-01-24 21:13:08 +00:00
|
|
|
struct extent_io_tree *tree;
|
2008-07-17 16:53:50 +00:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-17 16:53:50 +00:00
|
|
|
u64 page_start = page_offset(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
u64 page_end = page_start + PAGE_SIZE - 1;
|
2016-01-21 10:25:58 +00:00
|
|
|
u64 start;
|
|
|
|
u64 end;
|
2013-11-19 22:29:35 +00:00
|
|
|
int inode_evicting = inode->i_state & I_FREEING;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-09-02 20:53:46 +00:00
|
|
|
/*
|
|
|
|
* we have the page locked, so new writeback can't start,
|
|
|
|
* and the dirty bit won't be cleared while we are here.
|
|
|
|
*
|
|
|
|
* Wait for IO on this page so that we can safely clear
|
|
|
|
* the PagePrivate2 bit and do ordered accounting
|
|
|
|
*/
|
2008-07-17 16:53:50 +00:00
|
|
|
wait_on_page_writeback(page);
|
2009-09-02 20:53:46 +00:00
|
|
|
|
2012-05-02 18:00:54 +00:00
|
|
|
tree = &BTRFS_I(inode)->io_tree;
|
2008-07-17 16:53:50 +00:00
|
|
|
if (offset) {
|
|
|
|
btrfs_releasepage(page, GFP_NOFS);
|
|
|
|
return;
|
|
|
|
}
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
if (!inode_evicting)
|
2015-12-03 13:30:40 +00:00
|
|
|
lock_extent_bits(tree, page_start, page_end, &cached_state);
|
2016-01-21 10:25:58 +00:00
|
|
|
again:
|
|
|
|
start = page_start;
|
2017-02-20 11:50:49 +00:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
|
2016-01-21 10:25:58 +00:00
|
|
|
page_end - start + 1);
|
2008-07-17 16:53:50 +00:00
|
|
|
if (ordered) {
|
2016-01-21 10:25:58 +00:00
|
|
|
end = min(page_end, ordered->file_offset + ordered->len - 1);
|
2008-07-17 17:53:27 +00:00
|
|
|
/*
|
|
|
|
* IO on this page will never be started, so we need
|
|
|
|
* to account for any ordered extents now
|
|
|
|
*/
|
2013-11-19 22:29:35 +00:00
|
|
|
if (!inode_evicting)
|
2016-01-21 10:25:58 +00:00
|
|
|
clear_extent_bit(tree, start, end,
|
2013-11-19 22:29:35 +00:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
EXTENT_DELALLOC_NEW |
|
2013-11-19 22:29:35 +00:00
|
|
|
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
|
2017-10-31 15:37:52 +00:00
|
|
|
EXTENT_DEFRAG, 1, 0, &cached_state);
|
2009-09-02 20:53:46 +00:00
|
|
|
/*
|
|
|
|
* whoever cleared the private bit is responsible
|
|
|
|
* for the finish_ordered_io
|
|
|
|
*/
|
2013-08-29 17:57:21 +00:00
|
|
|
if (TestClearPagePrivate2(page)) {
|
|
|
|
struct btrfs_ordered_inode_tree *tree;
|
|
|
|
u64 new_len;
|
|
|
|
|
|
|
|
tree = &BTRFS_I(inode)->ordered_tree;
|
|
|
|
|
|
|
|
spin_lock_irq(&tree->lock);
|
|
|
|
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
|
2016-01-21 10:25:58 +00:00
|
|
|
new_len = start - ordered->file_offset;
|
2013-08-29 17:57:21 +00:00
|
|
|
if (new_len < ordered->truncated_len)
|
|
|
|
ordered->truncated_len = new_len;
|
|
|
|
spin_unlock_irq(&tree->lock);
|
|
|
|
|
|
|
|
if (btrfs_dec_test_ordered_pending(inode, &ordered,
|
2016-01-21 10:25:58 +00:00
|
|
|
start,
|
|
|
|
end - start + 1, 1))
|
2013-08-29 17:57:21 +00:00
|
|
|
btrfs_finish_ordered_io(ordered);
|
2009-09-02 20:53:46 +00:00
|
|
|
}
|
2008-07-17 16:53:50 +00:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
2013-11-19 22:29:35 +00:00
|
|
|
if (!inode_evicting) {
|
|
|
|
cached_state = NULL;
|
2016-01-21 10:25:58 +00:00
|
|
|
lock_extent_bits(tree, start, end,
|
2013-11-19 22:29:35 +00:00
|
|
|
&cached_state);
|
|
|
|
}
|
2016-01-21 10:25:58 +00:00
|
|
|
|
|
|
|
start = end + 1;
|
|
|
|
if (start < page_end)
|
|
|
|
goto again;
|
2013-11-19 22:29:35 +00:00
|
|
|
}
|
|
|
|
|
2015-09-29 02:35:16 +00:00
|
|
|
/*
|
|
|
|
* Qgroup reserved space handler
|
|
|
|
* Page here will be either
|
|
|
|
* 1) Already written to disk
|
|
|
|
* In this case, its reserved space is released from data rsv map
|
|
|
|
* and will be freed by delayed_ref handler finally.
|
|
|
|
* So even we call qgroup_free_data(), it won't decrease reserved
|
|
|
|
* space.
|
|
|
|
* 2) Not written to disk
|
2016-09-30 15:40:52 +00:00
|
|
|
* This means the reserved space should be freed here. However,
|
|
|
|
* if a truncate invalidates the page (by clearing PageDirty)
|
|
|
|
* and the page is accounted for while allocating extent
|
|
|
|
* in btrfs_check_data_free_space() we let delayed_ref to
|
|
|
|
* free the entire extent.
|
2015-09-29 02:35:16 +00:00
|
|
|
*/
|
2016-09-30 15:40:52 +00:00
|
|
|
if (PageDirty(page))
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
|
2013-11-19 22:29:35 +00:00
|
|
|
if (!inode_evicting) {
|
|
|
|
clear_extent_bit(tree, page_start, page_end,
|
|
|
|
EXTENT_LOCKED | EXTENT_DIRTY |
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of dealloc bytes in the btrfs
inode. The later covers bytes that both fall within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimizes it a lot by shortening that time
window, as the new dealloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
|
2017-10-31 15:37:52 +00:00
|
|
|
&cached_state);
|
2013-11-19 22:29:35 +00:00
|
|
|
|
|
|
|
__btrfs_releasepage(page, GFP_NOFS);
|
2008-07-17 16:53:50 +00:00
|
|
|
}
|
|
|
|
|
2008-07-21 14:29:44 +00:00
|
|
|
ClearPageChecked(page);
|
2008-04-18 20:11:30 +00:00
|
|
|
if (PagePrivate(page)) {
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
set_page_private(page, 0);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
put_page(page);
|
2008-04-18 20:11:30 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2007-06-15 17:50:00 +00:00
|
|
|
/*
|
|
|
|
* btrfs_page_mkwrite() is not allowed to change the file size as it gets
|
|
|
|
* called from a page fault handler when a page is first dirtied. Hence we must
|
|
|
|
* be careful to check for EOF conditions here. We set the page up correctly
|
|
|
|
* for a written page which means we get ENOSPC checking when writing into
|
|
|
|
* holes and correct delalloc and unwritten extent mapping on filesystems that
|
|
|
|
* support these features.
|
|
|
|
*
|
|
|
|
* We are not allowed to take the i_mutex here so we have to play games to
|
|
|
|
* protect against truncate races as the page could now be beyond EOF. Because
|
2018-05-11 20:13:29 +00:00
|
|
|
* truncate_setsize() writes the inode size before removing pages, once we have
|
|
|
|
* the page lock we can determine safely if the page is beyond EOF. If it is not
|
2007-06-15 17:50:00 +00:00
|
|
|
* beyond EOF, then the page is guaranteed safe against truncation until we
|
|
|
|
* unlock the page.
|
|
|
|
*/
|
2018-06-06 14:24:44 +00:00
|
|
|
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
|
2007-06-15 17:50:00 +00:00
|
|
|
{
|
2009-03-31 22:23:21 +00:00
|
|
|
struct page *page = vmf->page;
|
2017-02-24 22:56:41 +00:00
|
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-07-17 16:53:50 +00:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 19:33:23 +00:00
|
|
|
struct extent_state *cached_state = NULL;
|
2017-02-27 07:10:38 +00:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2008-07-17 16:53:50 +00:00
|
|
|
char *kaddr;
|
|
|
|
unsigned long zero_start;
|
2007-06-15 17:50:00 +00:00
|
|
|
loff_t size;
|
2018-06-06 14:24:44 +00:00
|
|
|
vm_fault_t ret;
|
|
|
|
int ret2;
|
2012-01-25 18:47:40 +00:00
|
|
|
int reserved = 0;
|
2016-01-21 10:25:57 +00:00
|
|
|
u64 reserved_space;
|
2007-08-27 20:49:44 +00:00
|
|
|
u64 page_start;
|
2008-07-17 16:53:50 +00:00
|
|
|
u64 page_end;
|
2016-01-21 10:25:57 +00:00
|
|
|
u64 end;
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
reserved_space = PAGE_SIZE;
|
2007-06-15 17:50:00 +00:00
|
|
|
|
2012-06-12 14:20:45 +00:00
|
|
|
sb_start_pagefault(inode->i_sb);
|
2015-09-08 09:25:54 +00:00
|
|
|
page_start = page_offset(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
page_end = page_start + PAGE_SIZE - 1;
|
2016-01-21 10:25:57 +00:00
|
|
|
end = page_end;
|
2015-09-08 09:25:54 +00:00
|
|
|
|
2016-01-21 10:25:57 +00:00
|
|
|
/*
|
|
|
|
* Reserving delalloc space after obtaining the page lock can lead to
|
|
|
|
* deadlock. For example, if a dirty page is locked by this function
|
|
|
|
* and the call to btrfs_delalloc_reserve_space() ends up triggering
|
|
|
|
* dirty page write out, then the btrfs_writepage() function could
|
|
|
|
* end up waiting indefinitely to get a lock on the page currently
|
|
|
|
* being processed by btrfs_page_mkwrite() function.
|
|
|
|
*/
|
2018-06-06 14:24:44 +00:00
|
|
|
ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
|
2016-01-21 10:25:57 +00:00
|
|
|
reserved_space);
|
2018-06-06 14:24:44 +00:00
|
|
|
if (!ret2) {
|
|
|
|
ret2 = file_update_time(vmf->vma->vm_file);
|
2012-01-25 18:47:40 +00:00
|
|
|
reserved = 1;
|
|
|
|
}
|
2018-06-06 14:24:44 +00:00
|
|
|
if (ret2) {
|
|
|
|
ret = vmf_error(ret2);
|
2012-01-25 18:47:40 +00:00
|
|
|
if (reserved)
|
|
|
|
goto out;
|
|
|
|
goto out_noreserve;
|
2009-03-31 22:23:23 +00:00
|
|
|
}
|
2007-12-21 21:27:21 +00:00
|
|
|
|
2009-03-31 22:23:23 +00:00
|
|
|
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
|
2008-07-17 16:53:50 +00:00
|
|
|
again:
|
2007-06-15 17:50:00 +00:00
|
|
|
lock_page(page);
|
|
|
|
size = i_size_read(inode);
|
2007-08-27 20:49:44 +00:00
|
|
|
|
2007-06-15 17:50:00 +00:00
|
|
|
if ((page->mapping != inode->i_mapping) ||
|
2008-07-17 16:53:50 +00:00
|
|
|
(page_start >= size)) {
|
2007-06-15 17:50:00 +00:00
|
|
|
/* page got truncated out from underneath us */
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2008-07-17 16:53:50 +00:00
|
|
|
wait_on_page_writeback(page);
|
|
|
|
|
2015-12-03 13:30:40 +00:00
|
|
|
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
|
2008-07-17 16:53:50 +00:00
|
|
|
set_page_extent_mapped(page);
|
|
|
|
|
2008-07-17 17:53:27 +00:00
|
|
|
/*
|
|
|
|
* we can't set the delalloc bits if there are pending ordered
|
|
|
|
* extents. Drop our locks and wait for them to finish
|
|
|
|
*/
|
2017-02-20 11:50:49 +00:00
|
|
|
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
|
|
|
|
PAGE_SIZE);
|
2008-07-17 16:53:50 +00:00
|
|
|
if (ordered) {
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached_state);
|
2008-07-17 16:53:50 +00:00
|
|
|
unlock_page(page);
|
2008-07-17 17:53:27 +00:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-07-17 16:53:50 +00:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
|
2016-06-15 13:22:56 +00:00
|
|
|
reserved_space = round_up(size - page_start,
|
2016-06-22 22:54:23 +00:00
|
|
|
fs_info->sectorsize);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
if (reserved_space < PAGE_SIZE) {
|
2016-01-21 10:25:57 +00:00
|
|
|
end = page_start + reserved_space - 1;
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
page_start, PAGE_SIZE - reserved_space,
|
|
|
|
true);
|
2016-01-21 10:25:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-10-01 21:10:23 +00:00
|
|
|
/*
|
2016-12-13 20:15:19 +00:00
|
|
|
* page_mkwrite gets called when the page is firstly dirtied after it's
|
|
|
|
* faulted in, but write(2) could also dirty a page and set delalloc
|
|
|
|
* bits, thus in this case for space account reason, we still need to
|
|
|
|
* clear any delalloc bits within this page range since we have to
|
|
|
|
* reserve data&meta space before lock_page() (see above comments).
|
2009-10-01 21:10:23 +00:00
|
|
|
*/
|
2016-01-21 10:25:57 +00:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
|
2012-09-06 01:10:51 +00:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
|
2017-10-31 15:37:52 +00:00
|
|
|
0, 0, &cached_state);
|
2009-10-01 21:10:23 +00:00
|
|
|
|
2018-06-06 14:24:44 +00:00
|
|
|
ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
|
2016-07-19 08:50:36 +00:00
|
|
|
&cached_state, 0);
|
2018-06-06 14:24:44 +00:00
|
|
|
if (ret2) {
|
2010-02-03 19:33:23 +00:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
2017-12-12 20:43:52 +00:00
|
|
|
&cached_state);
|
2009-09-11 20:12:44 +00:00
|
|
|
ret = VM_FAULT_SIGBUS;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2018-06-06 14:24:44 +00:00
|
|
|
ret2 = 0;
|
2007-06-15 17:50:00 +00:00
|
|
|
|
|
|
|
/* page is wholly or partially inside EOF */
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
if (page_start + PAGE_SIZE > size)
|
|
|
|
zero_start = size & ~PAGE_MASK;
|
2007-06-15 17:50:00 +00:00
|
|
|
else
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
zero_start = PAGE_SIZE;
|
2007-06-15 17:50:00 +00:00
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
if (zero_start != PAGE_SIZE) {
|
2008-07-17 16:53:50 +00:00
|
|
|
kaddr = kmap(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
|
2008-07-17 16:53:50 +00:00
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
2008-07-17 16:53:51 +00:00
|
|
|
ClearPageChecked(page);
|
2008-07-17 16:53:50 +00:00
|
|
|
set_page_dirty(page);
|
2009-09-11 16:33:12 +00:00
|
|
|
SetPageUptodate(page);
|
2009-03-31 17:27:11 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
BTRFS_I(inode)->last_trans = fs_info->generation;
|
2009-10-13 17:21:08 +00:00
|
|
|
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
|
2012-08-29 07:07:55 +00:00
|
|
|
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
|
2009-10-13 17:21:08 +00:00
|
|
|
|
2017-12-12 20:43:52 +00:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
|
2007-06-15 17:50:00 +00:00
|
|
|
|
2018-06-06 14:24:44 +00:00
|
|
|
if (!ret2) {
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
|
2012-06-12 14:20:45 +00:00
|
|
|
sb_end_pagefault(inode->i_sb);
|
2017-02-27 07:10:38 +00:00
|
|
|
extent_changeset_free(data_reserved);
|
2009-09-11 16:33:12 +00:00
|
|
|
return VM_FAULT_LOCKED;
|
2012-06-12 14:20:45 +00:00
|
|
|
}
|
2018-06-25 17:03:41 +00:00
|
|
|
|
|
|
|
out_unlock:
|
2007-06-15 17:50:00 +00:00
|
|
|
unlock_page(page);
|
2007-12-21 21:27:21 +00:00
|
|
|
out:
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved, page_start,
|
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 07:34:32 +00:00
|
|
|
reserved_space, (ret != 0));
|
2012-01-25 18:47:40 +00:00
|
|
|
out_noreserve:
|
2012-06-12 14:20:45 +00:00
|
|
|
sb_end_pagefault(inode->i_sb);
|
2017-02-27 07:10:38 +00:00
|
|
|
extent_changeset_free(data_reserved);
|
2007-06-15 17:50:00 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-02-06 20:40:31 +00:00
|
|
|
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-05-03 14:40:22 +00:00
|
|
|
struct btrfs_block_rsv *rsv;
|
2018-05-22 16:59:50 +00:00
|
|
|
int ret;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
2016-06-22 22:54:23 +00:00
|
|
|
u64 mask = fs_info->sectorsize - 1;
|
|
|
|
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2018-02-06 20:40:31 +00:00
|
|
|
if (!skip_writeback) {
|
|
|
|
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
|
|
|
|
(u64)-1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2011-05-03 14:40:22 +00:00
|
|
|
/*
|
2018-05-11 20:13:32 +00:00
|
|
|
* Yes ladies and gentlemen, this is indeed ugly. We have a couple of
|
|
|
|
* things going on here:
|
2011-05-03 14:40:22 +00:00
|
|
|
*
|
2018-05-11 20:13:32 +00:00
|
|
|
* 1) We need to reserve space to update our inode.
|
2011-05-03 14:40:22 +00:00
|
|
|
*
|
2018-05-11 20:13:32 +00:00
|
|
|
* 2) We need to have something to cache all the space that is going to
|
2011-05-03 14:40:22 +00:00
|
|
|
* be free'd up by the truncate operation, but also have some slack
|
|
|
|
* space reserved in case it uses space during the truncate (thank you
|
|
|
|
* very much snapshotting).
|
|
|
|
*
|
2018-05-11 20:13:32 +00:00
|
|
|
* And we need these to be separate. The fact is we can use a lot of
|
2011-05-03 14:40:22 +00:00
|
|
|
* space doing the truncate, and we have no earthly idea how much space
|
2016-05-20 01:18:45 +00:00
|
|
|
* we will use, so we need the truncate reservation to be separate so it
|
2018-05-11 20:13:32 +00:00
|
|
|
* doesn't end up using space reserved for updating the inode. We also
|
|
|
|
* need to be able to stop the transaction and start a new one, which
|
|
|
|
* means we need to be able to update the inode several times, and we
|
|
|
|
* have no idea of knowing how many times that will be, so we can't just
|
|
|
|
* reserve 1 item for the entirety of the operation, so that has to be
|
|
|
|
* done separately as well.
|
2011-05-03 14:40:22 +00:00
|
|
|
*
|
|
|
|
* So that leaves us with
|
|
|
|
*
|
2018-05-11 20:13:32 +00:00
|
|
|
* 1) rsv - for the truncate reservation, which we will steal from the
|
2011-05-03 14:40:22 +00:00
|
|
|
* transaction reservation.
|
2018-05-11 20:13:32 +00:00
|
|
|
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
|
2011-05-03 14:40:22 +00:00
|
|
|
* updating the inode.
|
|
|
|
*/
|
2016-06-22 22:54:24 +00:00
|
|
|
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
|
2011-05-03 14:40:22 +00:00
|
|
|
if (!rsv)
|
|
|
|
return -ENOMEM;
|
2011-08-29 15:01:31 +00:00
|
|
|
rsv->size = min_size;
|
2012-08-27 21:48:15 +00:00
|
|
|
rsv->failfast = 1;
|
2011-03-04 19:37:08 +00:00
|
|
|
|
2011-08-08 17:46:15 +00:00
|
|
|
/*
|
2011-08-19 14:29:59 +00:00
|
|
|
* 1 for the truncate slack space
|
2011-08-08 17:46:15 +00:00
|
|
|
* 1 for updating the inode.
|
|
|
|
*/
|
2013-01-07 22:03:21 +00:00
|
|
|
trans = btrfs_start_transaction(root, 2);
|
2011-05-03 14:40:22 +00:00
|
|
|
if (IS_ERR(trans)) {
|
2018-05-22 16:59:50 +00:00
|
|
|
ret = PTR_ERR(trans);
|
2011-05-03 14:40:22 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2011-03-04 19:37:08 +00:00
|
|
|
|
2011-08-08 17:46:15 +00:00
|
|
|
/* Migrate the slack space for the truncate to our reserve */
|
2016-06-22 22:54:23 +00:00
|
|
|
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
|
2016-03-25 17:25:48 +00:00
|
|
|
min_size, 0);
|
2011-05-03 14:40:22 +00:00
|
|
|
BUG_ON(ret);
|
2011-03-04 19:37:08 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
/*
|
|
|
|
* So if we truncate and then write and fsync we normally would just
|
|
|
|
* write the extents that changed, which is a problem if we need to
|
|
|
|
* first truncate that entire inode. So set this flag so we write out
|
|
|
|
* all of the extents in the inode to the sync log so we're completely
|
|
|
|
* safe.
|
|
|
|
*/
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
|
2012-08-27 21:48:15 +00:00
|
|
|
trans->block_rsv = rsv;
|
2011-08-08 17:46:15 +00:00
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
while (1) {
|
|
|
|
ret = btrfs_truncate_inode_items(trans, root, inode,
|
|
|
|
inode->i_size,
|
|
|
|
BTRFS_EXTENT_DATA_KEY);
|
2017-10-19 18:16:02 +00:00
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
2018-05-22 16:59:50 +00:00
|
|
|
if (ret != -ENOSPC && ret != -EAGAIN)
|
2009-11-12 09:35:36 +00:00
|
|
|
break;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-11-12 09:35:36 +00:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2018-05-22 16:59:50 +00:00
|
|
|
if (ret)
|
2011-01-31 21:03:11 +00:00
|
|
|
break;
|
2012-08-27 21:48:15 +00:00
|
|
|
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2012-08-27 21:48:15 +00:00
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
|
|
if (IS_ERR(trans)) {
|
2018-05-22 16:59:50 +00:00
|
|
|
ret = PTR_ERR(trans);
|
2012-08-27 21:48:15 +00:00
|
|
|
trans = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
btrfs: fix false enospc error when truncating heavily reflinked file
Below test script can reveal this bug:
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=100
dev=$(losetup --show -f fs.img)
mkdir -p /mnt/mntpoint
mkfs.btrfs -f $dev
mount $dev /mnt/mntpoint
cd /mnt/mntpoint
echo "workdir is: /mnt/mntpoint"
blocksize=$((128 * 1024))
dd if=/dev/zero of=testfile bs=$blocksize count=1
sync
count=$((17*1024*1024*1024/blocksize))
echo "file size is:" $((count*blocksize))
for ((i = 1; i <= $count; i++)); do
dst_offset=$((blocksize * i))
xfs_io -f -c "reflink testfile 0 $dst_offset $blocksize"\
testfile > /dev/null
done
sync
truncate --size 0 testfile
The last truncate operation will fail with ENOSPC, but it should not
fail.
In btrfs_truncate(), we use a temporary block_rsv to do the truncate
operation. With every btrfs_truncate_inode_items() call, we migrate space
to this block_rsv, but forget to clean up the previous reservation, which
makes this block_rsv's reserved bytes keep growing. This reserved
space is only released at the end of btrfs_truncate(), so this metadata
leak impacts others' metadata reservations. In this case, it is
"btrfs_start_transaction(root, 2);" that fails with ENOSPC, which makes
this truncate operation fail.
Call btrfs_block_rsv_release() to fix this bug.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-09-07 12:17:38 +00:00
|
|
|
btrfs_block_rsv_release(fs_info, rsv, -1);
|
2016-06-22 22:54:23 +00:00
|
|
|
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
|
2016-03-25 17:25:48 +00:00
|
|
|
rsv, min_size, 0);
|
2012-08-27 21:48:15 +00:00
|
|
|
BUG_ON(ret); /* shouldn't happen */
|
|
|
|
trans->block_rsv = rsv;
|
2009-11-12 09:35:36 +00:00
|
|
|
}
|
|
|
|
|
2017-10-19 18:16:02 +00:00
|
|
|
/*
|
|
|
|
* We can't call btrfs_truncate_block inside a trans handle as we could
|
|
|
|
* deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
|
|
|
|
* we've truncated everything except the last little bit, and can do
|
|
|
|
* btrfs_truncate_block and then update the disk_i_size.
|
|
|
|
*/
|
|
|
|
if (ret == NEED_TRUNCATE_BLOCK) {
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
|
|
|
|
|
|
|
ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
|
|
|
|
}
|
|
|
|
|
2011-11-08 19:49:59 +00:00
|
|
|
if (trans) {
|
2018-05-22 16:59:50 +00:00
|
|
|
int ret2;
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
trans->block_rsv = &fs_info->trans_block_rsv;
|
2018-05-22 16:59:50 +00:00
|
|
|
ret2 = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret2 && !ret)
|
|
|
|
ret = ret2;
|
2008-07-24 16:17:14 +00:00
|
|
|
|
2018-05-22 16:59:50 +00:00
|
|
|
ret2 = btrfs_end_transaction(trans);
|
|
|
|
if (ret2 && !ret)
|
|
|
|
ret = ret2;
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2011-11-08 19:49:59 +00:00
|
|
|
}
|
2011-05-03 14:40:22 +00:00
|
|
|
out:
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_free_block_rsv(fs_info, rsv);
|
2011-05-03 14:40:22 +00:00
|
|
|
|
2018-05-22 16:59:50 +00:00
|
|
|
return ret;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* create a new subvolume directory/inode (helper for the ioctl).
|
|
|
|
*/
|
2008-12-11 21:30:39 +00:00
|
|
|
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated with it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
struct btrfs_root *new_root,
|
|
|
|
struct btrfs_root *parent_root,
|
|
|
|
u64 new_dirid)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2009-09-21 20:00:26 +00:00
|
|
|
int err;
|
2008-08-05 15:18:09 +00:00
|
|
|
u64 index = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2012-02-10 21:15:54 +00:00
|
|
|
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
|
|
|
|
new_dirid, new_dirid,
|
|
|
|
S_IFDIR | (~current_umask() & S_IRWXUGO),
|
|
|
|
&index);
|
2007-06-22 18:16:25 +00:00
|
|
|
if (IS_ERR(inode))
|
2008-06-12 01:53:53 +00:00
|
|
|
return PTR_ERR(inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
|
2011-10-28 12:13:29 +00:00
|
|
|
set_nlink(inode, 1);
|
2017-02-20 11:50:34 +00:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), 0);
|
2014-09-08 20:08:51 +00:00
|
|
|
unlock_new_inode(inode);
|
2008-06-10 01:57:42 +00:00
|
|
|
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated with it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
|
|
|
|
if (err)
|
|
|
|
btrfs_err(new_root->fs_info,
|
2014-05-15 14:48:20 +00:00
|
|
|
"error inheriting subvolume %llu properties: %d",
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated with it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
|
|
|
new_root->root_key.objectid, err);
|
|
|
|
|
2009-09-21 20:00:26 +00:00
|
|
|
err = btrfs_update_inode(trans, new_root, inode);
|
2008-10-09 17:39:39 +00:00
|
|
|
|
2009-09-21 20:00:26 +00:00
|
|
|
iput(inode);
|
2011-07-26 18:32:23 +00:00
|
|
|
return err;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
struct inode *btrfs_alloc_inode(struct super_block *sb)
|
|
|
|
{
|
2017-10-19 18:15:57 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_inode *ei;
|
2010-05-16 14:46:25 +00:00
|
|
|
struct inode *inode;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2017-10-31 16:08:27 +00:00
|
|
|
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (!ei)
|
|
|
|
return NULL;
|
2010-05-16 14:46:25 +00:00
|
|
|
|
|
|
|
ei->root = NULL;
|
|
|
|
ei->generation = 0;
|
2007-08-10 20:22:09 +00:00
|
|
|
ei->last_trans = 0;
|
2009-10-13 17:21:08 +00:00
|
|
|
ei->last_sub_trans = 0;
|
2008-09-05 20:13:11 +00:00
|
|
|
ei->logged_trans = 0;
|
2010-05-16 14:46:25 +00:00
|
|
|
ei->delalloc_bytes = 0;
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
ei->new_delalloc_bytes = 0;
|
2014-07-03 10:22:07 +00:00
|
|
|
ei->defrag_bytes = 0;
|
2010-05-16 14:46:25 +00:00
|
|
|
ei->disk_i_size = 0;
|
|
|
|
ei->flags = 0;
|
2011-08-04 14:25:02 +00:00
|
|
|
ei->csum_bytes = 0;
|
2010-05-16 14:46:25 +00:00
|
|
|
ei->index_cnt = (u64)-1;
|
2013-12-26 05:07:06 +00:00
|
|
|
ei->dir_index = 0;
|
2010-05-16 14:46:25 +00:00
|
|
|
ei->last_unlink_trans = 0;
|
2012-08-29 07:07:55 +00:00
|
|
|
ei->last_log_commit = 0;
|
2010-05-16 14:46:25 +00:00
|
|
|
|
2011-07-15 15:16:44 +00:00
|
|
|
spin_lock_init(&ei->lock);
|
|
|
|
ei->outstanding_extents = 0;
|
2017-10-19 18:15:57 +00:00
|
|
|
if (sb->s_magic != BTRFS_TEST_MAGIC)
|
|
|
|
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
|
|
|
|
BTRFS_BLOCK_RSV_DELALLOC);
|
2012-05-23 18:13:11 +00:00
|
|
|
ei->runtime_flags = 0;
|
2017-07-17 17:17:20 +00:00
|
|
|
ei->prop_compress = BTRFS_COMPRESS_NONE;
|
2017-07-17 17:41:31 +00:00
|
|
|
ei->defrag_compress = BTRFS_COMPRESS_NONE;
|
2010-05-16 14:46:25 +00:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that uses two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which are waiting to be dealt with
by the worker thread.
- Every delayed node has two rb-trees: one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the number of delayed items is beyond the lower limit, we create works for
some delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items is beyond the upper bound, we create works for
all the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items is below
some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree first. If we find it, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cache the data of the
inode into the delayed node. The worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 10:12:22 +00:00
|
|
|
ei->delayed_node = NULL;
|
|
|
|
|
2012-07-04 07:18:07 +00:00
|
|
|
ei->i_otime.tv_sec = 0;
|
|
|
|
ei->i_otime.tv_nsec = 0;
|
|
|
|
|
2010-05-16 14:46:25 +00:00
|
|
|
inode = &ei->vfs_inode;
|
2011-04-20 22:34:43 +00:00
|
|
|
extent_map_tree_init(&ei->extent_tree);
|
2017-05-05 15:57:13 +00:00
|
|
|
extent_io_tree_init(&ei->io_tree, inode);
|
|
|
|
extent_io_tree_init(&ei->io_failure_tree, inode);
|
2012-03-13 13:38:00 +00:00
|
|
|
ei->io_tree.track_uptodate = 1;
|
|
|
|
ei->io_failure_tree.track_uptodate = 1;
|
2012-11-16 18:56:32 +00:00
|
|
|
atomic_set(&ei->sync_writers, 0);
|
2010-05-16 14:46:25 +00:00
|
|
|
mutex_init(&ei->log_mutex);
|
2012-01-13 17:09:22 +00:00
|
|
|
mutex_init(&ei->delalloc_mutex);
|
2008-07-17 16:53:50 +00:00
|
|
|
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
|
2010-05-16 14:46:25 +00:00
|
|
|
INIT_LIST_HEAD(&ei->delalloc_inodes);
|
2015-11-19 13:15:51 +00:00
|
|
|
INIT_LIST_HEAD(&ei->delayed_iput);
|
2010-05-16 14:46:25 +00:00
|
|
|
RB_CLEAR_NODE(&ei->rb_node);
|
2016-05-12 12:53:36 +00:00
|
|
|
init_rwsem(&ei->dio_sem);
|
2010-05-16 14:46:25 +00:00
|
|
|
|
|
|
|
return inode;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2013-10-11 18:44:09 +00:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
|
|
|
/*
 * Inode teardown used only when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is set.
 *
 * Calls btrfs_drop_extent_cache() on the inode over the whole offset range
 * (0 .. (u64)-1) and then hands the containing btrfs_inode straight back to
 * btrfs_inode_cachep. Unlike btrfs_destroy_inode(), the free is immediate
 * and does not go through call_rcu().
 */
void btrfs_test_destroy_inode(struct inode *inode)
|
|
|
|
{
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
|
2013-10-11 18:44:09 +00:00
|
|
|
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-01-07 06:49:49 +00:00
|
|
|
/*
 * RCU callback registered by btrfs_destroy_inode() via call_rcu().
 *
 * @head: the i_rcu member embedded in the struct inode being freed.
 *
 * Recovers the struct inode from @head with container_of() and returns the
 * btrfs_inode that contains it to btrfs_inode_cachep.
 */
static void btrfs_i_callback(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct inode *inode = container_of(head, struct inode, i_rcu);
|
|
|
|
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
|
|
|
|
}
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
void btrfs_destroy_inode(struct inode *inode)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
2008-07-17 16:53:50 +00:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2009-03-31 17:27:11 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
2012-06-09 17:51:19 +00:00
|
|
|
WARN_ON(!hlist_empty(&inode->i_dentry));
|
2007-06-12 10:35:45 +00:00
|
|
|
WARN_ON(inode->i_data.nrpages);
|
2017-10-19 18:15:57 +00:00
|
|
|
WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
|
|
|
|
WARN_ON(BTRFS_I(inode)->block_rsv.size);
|
2011-07-15 15:16:44 +00:00
|
|
|
WARN_ON(BTRFS_I(inode)->outstanding_extents);
|
2011-08-04 14:25:02 +00:00
|
|
|
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
|
2011-08-04 14:25:02 +00:00
|
|
|
WARN_ON(BTRFS_I(inode)->csum_bytes);
|
2014-07-03 10:22:07 +00:00
|
|
|
WARN_ON(BTRFS_I(inode)->defrag_bytes);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2009-11-11 20:53:34 +00:00
|
|
|
/*
|
|
|
|
* This can happen where we create an inode, but somebody else also
|
|
|
|
* created the same inode and we need to destroy the one we already
|
|
|
|
* created.
|
|
|
|
*/
|
|
|
|
if (!root)
|
|
|
|
goto free;
|
|
|
|
|
2009-01-06 02:25:51 +00:00
|
|
|
while (1) {
|
2008-07-17 16:53:50 +00:00
|
|
|
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
|
|
|
|
if (!ordered)
|
|
|
|
break;
|
|
|
|
else {
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 14:05:00 +00:00
|
|
|
"found ordered extent %llu %llu on inode cleanup",
|
|
|
|
ordered->file_offset, ordered->len);
|
2008-07-17 16:53:50 +00:00
|
|
|
btrfs_remove_ordered_extent(inode, ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
|
|
|
}
|
2015-10-13 01:53:10 +00:00
|
|
|
btrfs_qgroup_check_reserved_leak(inode);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 14:45:14 +00:00
|
|
|
inode_tree_del(inode);
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
|
2009-11-11 20:53:34 +00:00
|
|
|
free:
|
2011-01-07 06:49:49 +00:00
|
|
|
call_rcu(&inode->i_rcu, btrfs_i_callback);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2010-06-07 17:43:19 +00:00
|
|
|
int btrfs_drop_inode(struct inode *inode)
|
2009-09-21 20:00:26 +00:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-06-07 17:43:19 +00:00
|
|
|
|
2013-06-06 09:56:34 +00:00
|
|
|
if (root == NULL)
|
|
|
|
return 1;
|
|
|
|
|
Btrfs: fix cleaner thread not working with inode cache option
Right now inode cache inode is treated as the same as space cache
inode, ie. keep inode in memory till putting super.
But this leads to an awkward situation.
If we're going to delete a snapshot/subvolume, btrfs will not
actually delete it and return free space, but will add it to dead
roots list until the last inode on this snap/subvol being destroyed.
Then we'll fetch deleted roots and cleanup them via cleaner thread.
So here is the problem, if we enable inode cache option, each
snap/subvol has a cached inode which is used to store inode allocation
information. And this cache inode will be kept in memory, as the above
said. So with inode cache, snap/subvol can only be added into
dead roots list during freeing roots stage in umount, so that we can
ONLY get space back after another remount(we cleanup dead roots on mount).
But the real thing is we'll no more use the snap/subvol if we mark it
deleted, so we can safely iput its cache inode when we delete snap/subvol.
Another thing is that we need to change the rules of dropping inode, we
don't keep snap/subvol's cache inode in memory till end so that we can
add snap/subvol into dead roots list in time.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-02-20 14:10:23 +00:00
|
|
|
/* the snap/subvol tree is on deleting */
|
2013-09-05 14:58:43 +00:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
2010-06-07 17:43:19 +00:00
|
|
|
return 1;
|
2009-09-21 20:00:26 +00:00
|
|
|
else
|
2010-06-07 17:43:19 +00:00
|
|
|
return generic_drop_inode(inode);
|
2009-09-21 20:00:26 +00:00
|
|
|
}
|
|
|
|
|
2008-07-30 20:54:26 +00:00
|
|
|
/*
 * Constructor passed to kmem_cache_create() for the "btrfs_inode" cache in
 * btrfs_init_cachep().
 *
 * @foo: pointer to the object being constructed; treated as a
 *       struct btrfs_inode.
 *
 * Runs inode_init_once() on the VFS inode embedded in the btrfs_inode.
 */
static void init_once(void *foo)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
|
|
|
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
|
|
|
|
|
|
|
|
inode_init_once(&ei->vfs_inode);
|
|
|
|
}
|
|
|
|
|
2018-02-19 16:24:18 +00:00
|
|
|
/*
 * Destroy the four slab caches set up by btrfs_init_cachep(): inodes,
 * transaction handles, paths and free-space entries.
 *
 * Also serves as the error path of btrfs_init_cachep(), so it can run when
 * only some of the caches were created.
 * NOTE(review): that relies on kmem_cache_destroy() accepting a NULL cache
 * pointer - confirm for the target kernel version.
 */
void __cold btrfs_destroy_cachep(void)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2012-09-26 01:33:07 +00:00
|
|
|
/*
|
|
|
|
* Make sure all delayed rcu free inodes are flushed before we
|
|
|
|
* destroy cache.
|
|
|
|
*/
|
|
|
|
rcu_barrier();
|
2016-01-29 13:36:35 +00:00
|
|
|
kmem_cache_destroy(btrfs_inode_cachep);
|
|
|
|
kmem_cache_destroy(btrfs_trans_handle_cachep);
|
|
|
|
kmem_cache_destroy(btrfs_path_cachep);
|
|
|
|
kmem_cache_destroy(btrfs_free_space_cachep);
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
|
|
|
|
2017-11-02 23:21:50 +00:00
|
|
|
/*
 * Create the slab caches used by this file, in order:
 *   "btrfs_inode"        - struct btrfs_inode, constructed with init_once()
 *   "btrfs_trans_handle" - struct btrfs_trans_handle
 *   "btrfs_path"         - struct btrfs_path
 *   "btrfs_free_space"   - struct btrfs_free_space
 *
 * Returns 0 when all four caches were created. If any kmem_cache_create()
 * call fails, btrfs_destroy_cachep() is called to tear down whatever was
 * created so far and -ENOMEM is returned.
 */
int __init btrfs_init_cachep(void)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2012-09-07 09:00:48 +00:00
|
|
|
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
|
2009-04-13 13:33:09 +00:00
|
|
|
sizeof(struct btrfs_inode), 0,
|
2016-01-14 23:18:21 +00:00
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
|
|
|
|
init_once);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (!btrfs_inode_cachep)
|
|
|
|
goto fail;
|
2009-04-13 13:33:09 +00:00
|
|
|
|
2012-09-07 09:00:48 +00:00
|
|
|
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
|
2009-04-13 13:33:09 +00:00
|
|
|
sizeof(struct btrfs_trans_handle), 0,
|
2016-06-23 18:17:08 +00:00
|
|
|
SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (!btrfs_trans_handle_cachep)
|
|
|
|
goto fail;
|
2009-04-13 13:33:09 +00:00
|
|
|
|
2012-09-07 09:00:48 +00:00
|
|
|
btrfs_path_cachep = kmem_cache_create("btrfs_path",
|
2009-04-13 13:33:09 +00:00
|
|
|
sizeof(struct btrfs_path), 0,
|
2016-06-23 18:17:08 +00:00
|
|
|
SLAB_MEM_SPREAD, NULL);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (!btrfs_path_cachep)
|
|
|
|
goto fail;
|
2009-04-13 13:33:09 +00:00
|
|
|
|
2012-09-07 09:00:48 +00:00
|
|
|
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
|
2011-01-28 22:05:48 +00:00
|
|
|
sizeof(struct btrfs_free_space), 0,
|
2016-06-23 18:17:08 +00:00
|
|
|
SLAB_MEM_SPREAD, NULL);
|
2011-01-28 22:05:48 +00:00
|
|
|
if (!btrfs_free_space_cachep)
|
|
|
|
goto fail;
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
return 0;
|
|
|
|
fail:
|
|
|
|
btrfs_destroy_cachep();
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spare*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
|
|
|
static int btrfs_getattr(const struct path *path, struct kstat *stat,
|
|
|
|
u32 request_mask, unsigned int flags)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2013-01-29 10:11:59 +00:00
|
|
|
u64 delalloc_bytes;
|
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spare*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
|
|
|
struct inode *inode = d_inode(path->dentry);
|
2011-11-20 12:33:38 +00:00
|
|
|
u32 blocksize = inode->i_sb->s_blocksize;
|
2017-05-12 22:07:43 +00:00
|
|
|
u32 bi_flags = BTRFS_I(inode)->flags;
|
|
|
|
|
|
|
|
stat->result_mask |= STATX_BTIME;
|
|
|
|
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
|
|
|
|
stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
|
|
|
|
if (bi_flags & BTRFS_INODE_APPEND)
|
|
|
|
stat->attributes |= STATX_ATTR_APPEND;
|
|
|
|
if (bi_flags & BTRFS_INODE_COMPRESS)
|
|
|
|
stat->attributes |= STATX_ATTR_COMPRESSED;
|
|
|
|
if (bi_flags & BTRFS_INODE_IMMUTABLE)
|
|
|
|
stat->attributes |= STATX_ATTR_IMMUTABLE;
|
|
|
|
if (bi_flags & BTRFS_INODE_NODUMP)
|
|
|
|
stat->attributes |= STATX_ATTR_NODUMP;
|
|
|
|
|
|
|
|
stat->attributes_mask |= (STATX_ATTR_APPEND |
|
|
|
|
STATX_ATTR_COMPRESSED |
|
|
|
|
STATX_ATTR_IMMUTABLE |
|
|
|
|
STATX_ATTR_NODUMP);
|
2011-11-20 12:33:38 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
generic_fillattr(inode, stat);
|
2011-07-07 19:44:25 +00:00
|
|
|
stat->dev = BTRFS_I(inode)->root->anon_dev;
|
2013-01-29 10:11:59 +00:00
|
|
|
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
Btrfs: fix reported number of inode blocks
Currently when there are buffered writes that were not yet flushed and
they fall within allocated ranges of the file (that is, not in holes or
beyond eof assuming there are no prealloc extents beyond eof), btrfs
simply reports an incorrect number of used blocks through the stat(2)
system call (or any of its variants), regardless of mount options or
inode flags (compress, compress-force, nodatacow). This is because the
number of blocks used that is reported is based on the current number
of bytes in the vfs inode plus the number of delalloc bytes in the btrfs
inode. The latter covers bytes that fall both within allocated regions
of the file and within holes.
Example scenarios where the number of reported blocks is wrong while the
buffered writes are not flushed:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec)
# The following should have reported 64K...
$ du -h /mnt/sdc/foo1
128K /mnt/sdc/foo1
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo1
64K /mnt/sdc/foo1
$ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 0
64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec)
$ sync
$ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2
wrote 65536/65536 bytes at offset 65536
64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec)
# The following should have reported 128K...
$ du -h /mnt/sdc/foo2
192K /mnt/sdc/foo2
$ sync
# After flushing the buffered write, it now reports the correct value.
$ du -h /mnt/sdc/foo2
128K /mnt/sdc/foo2
So the number of used file blocks is simply incorrect, unlike in other
filesystems such as ext4 and xfs for example, but only while the buffered
writes are not flushed.
Fix this by tracking the number of delalloc bytes that fall within holes
and beyond eof of a file, and use instead this new counter when reporting
the number of used blocks for an inode.
Another different problem that exists is that the delalloc bytes counter
is reset when writeback starts (by clearing the EXTENT_DELALLOC flag from
the respective range in the inode's iotree) and the vfs inode's bytes
counter is only incremented when writeback finishes (through
insert_reserved_file_extent()). Therefore while writeback is ongoing we
simply report a wrong number of blocks used by an inode if the write
operation covers a range previously unallocated. While this change does
not fix this problem, it does minimize it a lot by shortening that time
window, as the new delalloc bytes counter (new_delalloc_bytes) is only
decremented when writeback finishes right before updating the vfs inode's
bytes counter. Fully fixing this second problem is not trivial and will
be addressed later by a different patch.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2017-04-03 09:45:46 +00:00
|
|
|
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
|
2013-01-29 10:11:59 +00:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2011-11-20 12:33:38 +00:00
|
|
|
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
|
2013-01-29 10:11:59 +00:00
|
|
|
ALIGN(delalloc_bytes, blocksize)) >> 9;
|
2007-06-12 10:35:45 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-03-17 14:23:38 +00:00
|
|
|
static int btrfs_rename_exchange(struct inode *old_dir,
|
|
|
|
struct dentry *old_dentry,
|
|
|
|
struct inode *new_dir,
|
|
|
|
struct dentry *new_dentry)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
|
2016-03-17 14:23:38 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(old_dir)->root;
|
|
|
|
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
|
|
|
|
struct inode *new_inode = new_dentry->d_inode;
|
|
|
|
struct inode *old_inode = old_dentry->d_inode;
|
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-09 02:36:02 +00:00
|
|
|
struct timespec64 ctime = current_time(old_inode);
|
2016-03-17 14:23:38 +00:00
|
|
|
struct dentry *parent;
|
2017-01-10 18:35:31 +00:00
|
|
|
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
|
|
|
|
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
|
2016-03-17 14:23:38 +00:00
|
|
|
u64 old_idx = 0;
|
|
|
|
u64 new_idx = 0;
|
|
|
|
u64 root_objectid;
|
|
|
|
int ret;
|
2018-06-11 18:24:16 +00:00
|
|
|
int ret2;
|
2016-05-05 01:02:27 +00:00
|
|
|
bool root_log_pinned = false;
|
|
|
|
bool dest_log_pinned = false;
|
2016-03-17 14:23:38 +00:00
|
|
|
|
|
|
|
/* we only allow rename subvolume link between subvolumes */
|
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
|
|
|
|
return -EXDEV;
|
|
|
|
|
|
|
|
/* close the race window with snapshot create/destroy ioctl */
|
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 22:54:23 +00:00
|
|
|
down_read(&fs_info->subvol_sem);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 22:54:23 +00:00
|
|
|
down_read(&fs_info->subvol_sem);
|
2016-03-17 14:23:38 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We want to reserve the absolute worst case amount of items. So if
|
|
|
|
* both inodes are subvols and we need to unlink them then that would
|
|
|
|
* require 4 item modifications, but if they are both normal inodes it
|
|
|
|
* would require 5 item modifications, so we'll assume they're normal
|
|
|
|
* inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
|
|
|
|
* should cover the worst case number of items we'll modify.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 12);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out_notrans;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to find a free sequence number both in the source and
|
|
|
|
* in the destination directory for the exchange.
|
|
|
|
*/
|
2017-02-20 11:50:33 +00:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2017-02-20 11:50:33 +00:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
|
|
|
|
|
|
|
BTRFS_I(old_inode)->dir_index = 0ULL;
|
|
|
|
BTRFS_I(new_inode)->dir_index = 0ULL;
|
|
|
|
|
|
|
|
/* Reference for the source. */
|
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
/* force full log commit if subvolume involved. */
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_set_log_full_commit(fs_info, trans);
|
2016-03-17 14:23:38 +00:00
|
|
|
} else {
|
2016-05-05 01:08:56 +00:00
|
|
|
btrfs_pin_log_trans(root);
|
|
|
|
root_log_pinned = true;
|
2016-03-17 14:23:38 +00:00
|
|
|
ret = btrfs_insert_inode_ref(trans, dest,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len,
|
|
|
|
old_ino,
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_ino(BTRFS_I(new_dir)),
|
|
|
|
old_idx);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* And now for the dest. */
|
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
/* force full log commit if subvolume involved. */
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_set_log_full_commit(fs_info, trans);
|
2016-03-17 14:23:38 +00:00
|
|
|
} else {
|
2016-05-05 01:08:56 +00:00
|
|
|
btrfs_pin_log_trans(dest);
|
|
|
|
dest_log_pinned = true;
|
2016-03-17 14:23:38 +00:00
|
|
|
ret = btrfs_insert_inode_ref(trans, root,
|
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len,
|
|
|
|
new_ino,
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_ino(BTRFS_I(old_dir)),
|
|
|
|
new_idx);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Update inode version and ctime/mtime. */
|
|
|
|
inode_inc_iversion(old_dir);
|
|
|
|
inode_inc_iversion(new_dir);
|
|
|
|
inode_inc_iversion(old_inode);
|
|
|
|
inode_inc_iversion(new_inode);
|
|
|
|
old_dir->i_ctime = old_dir->i_mtime = ctime;
|
|
|
|
new_dir->i_ctime = new_dir->i_mtime = ctime;
|
|
|
|
old_inode->i_ctime = ctime;
|
|
|
|
new_inode->i_ctime = ctime;
|
|
|
|
|
|
|
|
if (old_dentry->d_parent != new_dentry->d_parent) {
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
|
|
|
|
BTRFS_I(old_inode), 1);
|
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
|
|
|
|
BTRFS_I(new_inode), 1);
|
2016-03-17 14:23:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* src is a subvolume */
|
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
|
2018-08-01 03:32:30 +00:00
|
|
|
ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
|
2016-03-17 14:23:38 +00:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
} else { /* src is an inode */
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
|
|
|
|
BTRFS_I(old_dentry->d_inode),
|
2016-03-17 14:23:38 +00:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_update_inode(trans, root, old_inode);
|
|
|
|
}
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 14:23:38 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* dest is a subvolume */
|
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
|
2018-08-01 03:32:30 +00:00
|
|
|
ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
|
2016-03-17 14:23:38 +00:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
} else { /* dest is an inode */
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
|
|
|
|
BTRFS_I(new_dentry->d_inode),
|
2016-03-17 14:23:38 +00:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_update_inode(trans, dest, new_inode);
|
|
|
|
}
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 14:23:38 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:51:08 +00:00
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
|
2016-03-17 14:23:38 +00:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len, 0, old_idx);
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 14:23:38 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
|
|
|
|
2017-02-20 11:51:08 +00:00
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
|
2016-03-17 14:23:38 +00:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len, 0, new_idx);
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 14:23:38 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (old_inode->i_nlink == 1)
|
|
|
|
BTRFS_I(old_inode)->dir_index = old_idx;
|
|
|
|
if (new_inode->i_nlink == 1)
|
|
|
|
BTRFS_I(new_inode)->dir_index = new_idx;
|
|
|
|
|
2016-05-05 01:02:27 +00:00
|
|
|
if (root_log_pinned) {
|
2016-03-17 14:23:38 +00:00
|
|
|
parent = new_dentry->d_parent;
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
|
|
|
|
parent);
|
2016-03-17 14:23:38 +00:00
|
|
|
btrfs_end_log_trans(root);
|
2016-05-05 01:02:27 +00:00
|
|
|
root_log_pinned = false;
|
2016-03-17 14:23:38 +00:00
|
|
|
}
|
2016-05-05 01:02:27 +00:00
|
|
|
if (dest_log_pinned) {
|
2016-03-17 14:23:38 +00:00
|
|
|
parent = old_dentry->d_parent;
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
|
|
|
|
parent);
|
2016-03-17 14:23:38 +00:00
|
|
|
btrfs_end_log_trans(dest);
|
2016-05-05 01:02:27 +00:00
|
|
|
dest_log_pinned = false;
|
2016-03-17 14:23:38 +00:00
|
|
|
}
|
|
|
|
out_fail:
|
2016-05-05 01:02:27 +00:00
|
|
|
/*
|
|
|
|
* If we have pinned a log and an error happened, we unpin tasks
|
|
|
|
* trying to sync the log and force them to fallback to a transaction
|
|
|
|
* commit if the log currently contains any of the inodes involved in
|
|
|
|
* this rename operation (to ensure we do not persist a log with an
|
|
|
|
* inconsistent state for any of these inodes or leading to any
|
|
|
|
* inconsistencies when replayed). If the transaction was aborted, the
|
|
|
|
* abortion reason is propagated to userspace when attempting to commit
|
|
|
|
* the transaction. If the log does not contain any of these inodes, we
|
|
|
|
* allow the tasks to sync it.
|
|
|
|
*/
|
|
|
|
if (ret && (root_log_pinned || dest_log_pinned)) {
|
2017-01-17 22:31:30 +00:00
|
|
|
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
|
|
|
|
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
|
|
|
|
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
|
2016-05-05 01:02:27 +00:00
|
|
|
(new_inode &&
|
2017-01-17 22:31:30 +00:00
|
|
|
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_set_log_full_commit(fs_info, trans);
|
2016-05-05 01:02:27 +00:00
|
|
|
|
|
|
|
if (root_log_pinned) {
|
|
|
|
btrfs_end_log_trans(root);
|
|
|
|
root_log_pinned = false;
|
|
|
|
}
|
|
|
|
if (dest_log_pinned) {
|
|
|
|
btrfs_end_log_trans(dest);
|
|
|
|
dest_log_pinned = false;
|
|
|
|
}
|
|
|
|
}
|
2018-06-11 18:24:16 +00:00
|
|
|
ret2 = btrfs_end_transaction(trans);
|
|
|
|
ret = ret ? ret : ret2;
|
2016-03-17 14:23:38 +00:00
|
|
|
out_notrans:
|
|
|
|
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 22:54:23 +00:00
|
|
|
up_read(&fs_info->subvol_sem);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 22:54:23 +00:00
|
|
|
up_read(&fs_info->subvol_sem);
|
2016-03-17 14:23:38 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *dir,
|
|
|
|
struct dentry *dentry)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 objectid;
|
|
|
|
u64 index;
|
|
|
|
|
|
|
|
ret = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
inode = btrfs_new_inode(trans, root, dir,
|
|
|
|
dentry->d_name.name,
|
|
|
|
dentry->d_name.len,
|
2017-01-10 18:35:31 +00:00
|
|
|
btrfs_ino(BTRFS_I(dir)),
|
2016-03-17 14:23:38 +00:00
|
|
|
objectid,
|
|
|
|
S_IFCHR | WHITEOUT_MODE,
|
|
|
|
&index);
|
|
|
|
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
ret = PTR_ERR(inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
|
|
|
init_special_inode(inode, inode->i_mode,
|
|
|
|
WHITEOUT_DEV);
|
|
|
|
|
|
|
|
ret = btrfs_init_inode_security(trans, inode, dir,
|
|
|
|
&dentry->d_name);
|
|
|
|
if (ret)
|
2016-05-05 00:41:57 +00:00
|
|
|
goto out;
|
2016-03-17 14:23:38 +00:00
|
|
|
|
2017-02-20 11:51:09 +00:00
|
|
|
ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
|
|
|
|
BTRFS_I(inode), 0, index);
|
2016-03-17 14:23:38 +00:00
|
|
|
if (ret)
|
2016-05-05 00:41:57 +00:00
|
|
|
goto out;
|
2016-03-17 14:23:38 +00:00
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2016-05-05 00:41:57 +00:00
|
|
|
out:
|
2016-03-17 14:23:38 +00:00
|
|
|
unlock_new_inode(inode);
|
2016-05-05 00:41:57 +00:00
|
|
|
if (ret)
|
|
|
|
inode_dec_link_count(inode);
|
2016-03-17 14:23:38 +00:00
|
|
|
iput(inode);
|
|
|
|
|
2016-05-05 00:41:57 +00:00
|
|
|
return ret;
|
2016-03-17 14:23:38 +00:00
|
|
|
}
|
|
|
|
|
2009-01-06 02:25:51 +00:00
|
|
|
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
2016-03-17 14:23:38 +00:00
|
|
|
struct inode *new_dir, struct dentry *new_dentry,
|
|
|
|
unsigned int flags)
|
2007-06-12 10:35:45 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
2016-05-05 09:26:26 +00:00
|
|
|
unsigned int trans_num_items;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(old_dir)->root;
|
2009-09-21 19:56:00 +00:00
|
|
|
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *new_inode = d_inode(new_dentry);
|
|
|
|
struct inode *old_inode = d_inode(old_dentry);
|
2008-08-05 15:18:09 +00:00
|
|
|
u64 index = 0;
|
2009-09-21 19:56:00 +00:00
|
|
|
u64 root_objectid;
|
2007-06-12 10:35:45 +00:00
|
|
|
int ret;
|
2017-01-10 18:35:31 +00:00
|
|
|
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
|
2016-04-29 10:34:22 +00:00
|
|
|
bool log_pinned = false;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2017-01-10 18:35:31 +00:00
|
|
|
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2009-09-24 13:17:31 +00:00
|
|
|
return -EPERM;
|
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
/* we only allow rename subvolume link between subvolumes */
|
2011-04-20 02:31:50 +00:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
|
2008-11-18 01:42:26 +00:00
|
|
|
return -EXDEV;
|
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
|
2017-01-10 18:35:31 +00:00
|
|
|
(new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
|
2007-06-12 10:35:45 +00:00
|
|
|
return -ENOTEMPTY;
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
if (S_ISDIR(old_inode->i_mode) && new_inode &&
|
|
|
|
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
|
|
|
return -ENOTEMPTY;
|
2012-12-17 19:26:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
/* check for collisions, even if the name isn't there */
|
2013-10-09 16:24:04 +00:00
|
|
|
ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
|
2012-12-17 19:26:57 +00:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
if (ret == -EEXIST) {
|
|
|
|
/* we shouldn't get
|
|
|
|
* eexist without a new_inode */
|
2013-10-31 05:00:08 +00:00
|
|
|
if (WARN_ON(!new_inode)) {
|
2012-12-17 19:26:57 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* maybe -EOVERFLOW */
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
|
2009-03-31 17:27:11 +00:00
|
|
|
/*
|
2014-08-12 17:47:42 +00:00
|
|
|
* we're using rename to replace one file with another. Start IO on it
|
|
|
|
* now so we don't add too much work to the end of the transaction
|
2009-03-31 17:27:11 +00:00
|
|
|
*/
|
2014-08-12 17:47:42 +00:00
|
|
|
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
|
2009-03-31 17:27:11 +00:00
|
|
|
filemap_flush(old_inode->i_mapping);
|
|
|
|
|
2009-09-21 20:00:26 +00:00
|
|
|
/* close the racy window with snapshot create/destroy ioctl */
|
2011-04-20 02:31:50 +00:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 22:54:23 +00:00
|
|
|
down_read(&fs_info->subvol_sem);
|
2010-05-16 14:48:46 +00:00
|
|
|
/*
|
|
|
|
* We want to reserve the absolute worst case amount of items. So if
|
|
|
|
* both inodes are subvols and we need to unlink them then that would
|
|
|
|
* require 4 item modifications, but if they are both normal inodes it
|
2016-03-17 14:23:38 +00:00
|
|
|
* would require 5 item modifications, so we'll assume they are normal
|
2010-05-16 14:48:46 +00:00
|
|
|
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
|
|
|
|
* should cover the worst case number of items we'll modify.
|
2016-05-05 09:26:26 +00:00
|
|
|
* If our rename has the whiteout flag, we need more 5 units for the
|
|
|
|
* new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
|
|
|
|
* when selinux is enabled).
|
2010-05-16 14:48:46 +00:00
|
|
|
*/
|
2016-05-05 09:26:26 +00:00
|
|
|
trans_num_items = 11;
|
|
|
|
if (flags & RENAME_WHITEOUT)
|
|
|
|
trans_num_items += 5;
|
|
|
|
trans = btrfs_start_transaction(root, trans_num_items);
|
2011-03-31 13:23:47 +00:00
|
|
|
if (IS_ERR(trans)) {
|
2016-03-17 14:23:38 +00:00
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out_notrans;
|
|
|
|
}
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2009-09-21 19:56:00 +00:00
|
|
|
if (dest != root)
|
|
|
|
btrfs_record_root_in_trans(trans, dest);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2017-02-20 11:50:33 +00:00
|
|
|
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
|
2009-09-24 13:17:31 +00:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2009-03-31 17:27:11 +00:00
|
|
|
|
2013-12-26 05:07:06 +00:00
|
|
|
BTRFS_I(old_inode)->dir_index = 0ULL;
|
2011-04-20 02:31:50 +00:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 19:56:00 +00:00
|
|
|
/* force full log commit if subvolume involved. */
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_set_log_full_commit(fs_info, trans);
|
2009-09-21 19:56:00 +00:00
|
|
|
} else {
|
2016-04-29 12:14:42 +00:00
|
|
|
btrfs_pin_log_trans(root);
|
|
|
|
log_pinned = true;
|
2009-09-24 13:17:31 +00:00
|
|
|
ret = btrfs_insert_inode_ref(trans, dest,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len,
|
2011-04-20 02:31:50 +00:00
|
|
|
old_ino,
|
2017-01-10 18:35:31 +00:00
|
|
|
btrfs_ino(BTRFS_I(new_dir)), index);
|
2009-09-24 13:17:31 +00:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2009-09-21 19:56:00 +00:00
|
|
|
}
|
2009-03-31 17:27:11 +00:00
|
|
|
|
2012-04-05 19:03:02 +00:00
|
|
|
inode_inc_iversion(old_dir);
|
|
|
|
inode_inc_iversion(new_dir);
|
|
|
|
inode_inc_iversion(old_inode);
|
2016-02-07 07:57:21 +00:00
|
|
|
old_dir->i_ctime = old_dir->i_mtime =
|
|
|
|
new_dir->i_ctime = new_dir->i_mtime =
|
2016-09-14 14:48:06 +00:00
|
|
|
old_inode->i_ctime = current_time(old_dir);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2009-03-24 14:24:20 +00:00
|
|
|
if (old_dentry->d_parent != new_dentry->d_parent)
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
|
|
|
|
BTRFS_I(old_inode), 1);
|
2009-03-24 14:24:20 +00:00
|
|
|
|
2011-04-20 02:31:50 +00:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 19:56:00 +00:00
|
|
|
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
|
2018-08-01 03:32:30 +00:00
|
|
|
ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
|
2009-09-21 19:56:00 +00:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
} else {
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
|
|
|
|
BTRFS_I(d_inode(old_dentry)),
|
2011-03-04 17:14:37 +00:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_update_inode(trans, root, old_inode);
|
2009-09-21 19:56:00 +00:00
|
|
|
}
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
if (new_inode) {
|
2012-04-05 19:03:02 +00:00
|
|
|
inode_inc_iversion(new_inode);
|
2016-09-14 14:48:06 +00:00
|
|
|
new_inode->i_ctime = current_time(new_inode);
|
2017-01-10 18:35:31 +00:00
|
|
|
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
|
2009-09-21 19:56:00 +00:00
|
|
|
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
|
|
|
root_objectid = BTRFS_I(new_inode)->location.objectid;
|
2018-08-01 03:32:30 +00:00
|
|
|
ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
|
2009-09-21 19:56:00 +00:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
BUG_ON(new_inode->i_nlink == 0);
|
|
|
|
} else {
|
2017-01-17 22:31:44 +00:00
|
|
|
ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
|
|
|
|
BTRFS_I(d_inode(new_dentry)),
|
2009-09-21 19:56:00 +00:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
}
|
2013-08-13 18:10:08 +00:00
|
|
|
if (!ret && new_inode->i_nlink == 0)
|
2017-02-20 11:50:59 +00:00
|
|
|
ret = btrfs_orphan_add(trans,
|
|
|
|
BTRFS_I(d_inode(new_dentry)));
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2008-07-24 16:12:38 +00:00
|
|
|
|
2017-02-20 11:51:08 +00:00
|
|
|
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
|
2009-09-21 19:56:00 +00:00
|
|
|
new_dentry->d_name.name,
|
2009-09-24 13:17:31 +00:00
|
|
|
new_dentry->d_name.len, 0, index);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2013-12-26 05:07:06 +00:00
|
|
|
if (old_inode->i_nlink == 1)
|
|
|
|
BTRFS_I(old_inode)->dir_index = index;
|
|
|
|
|
2016-04-29 10:34:22 +00:00
|
|
|
if (log_pinned) {
|
2011-07-17 03:09:10 +00:00
|
|
|
struct dentry *parent = new_dentry->d_parent;
|
2016-04-29 10:34:22 +00:00
|
|
|
|
2017-01-20 13:54:07 +00:00
|
|
|
btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
|
|
|
|
parent);
|
2009-09-21 19:56:00 +00:00
|
|
|
btrfs_end_log_trans(root);
|
2016-04-29 10:34:22 +00:00
|
|
|
log_pinned = false;
|
2009-09-21 19:56:00 +00:00
|
|
|
}
|
2016-03-17 14:23:38 +00:00
|
|
|
|
|
|
|
if (flags & RENAME_WHITEOUT) {
|
|
|
|
ret = btrfs_whiteout_for_rename(trans, root, old_dir,
|
|
|
|
old_dentry);
|
|
|
|
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2016-03-17 14:23:38 +00:00
|
|
|
goto out_fail;
|
|
|
|
}
|
2009-09-21 19:56:00 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
out_fail:
|
2016-04-29 10:34:22 +00:00
|
|
|
/*
|
|
|
|
* If we have pinned the log and an error happened, we unpin tasks
|
|
|
|
* trying to sync the log and force them to fallback to a transaction
|
|
|
|
* commit if the log currently contains any of the inodes involved in
|
|
|
|
* this rename operation (to ensure we do not persist a log with an
|
|
|
|
* inconsistent state for any of these inodes or leading to any
|
|
|
|
* inconsistencies when replayed). If the transaction was aborted, the
|
|
|
|
* abortion reason is propagated to userspace when attempting to commit
|
|
|
|
* the transaction. If the log does not contain any of these inodes, we
|
|
|
|
* allow the tasks to sync it.
|
|
|
|
*/
|
|
|
|
if (ret && log_pinned) {
|
2017-01-17 22:31:30 +00:00
|
|
|
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
|
|
|
|
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
|
|
|
|
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
|
2016-04-29 10:34:22 +00:00
|
|
|
(new_inode &&
|
2017-01-17 22:31:30 +00:00
|
|
|
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_set_log_full_commit(fs_info, trans);
|
2016-04-29 10:34:22 +00:00
|
|
|
|
|
|
|
btrfs_end_log_trans(root);
|
|
|
|
log_pinned = false;
|
|
|
|
}
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2011-03-31 13:23:47 +00:00
|
|
|
out_notrans:
|
2011-04-20 02:31:50 +00:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2016-06-22 22:54:23 +00:00
|
|
|
up_read(&fs_info->subvol_sem);
|
2009-09-11 20:12:44 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-07-23 13:15:32 +00:00
|
|
|
static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
|
|
|
|
struct inode *new_dir, struct dentry *new_dentry,
|
|
|
|
unsigned int flags)
|
|
|
|
{
|
2016-03-17 14:23:38 +00:00
|
|
|
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
|
2014-07-23 13:15:32 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2016-03-17 14:23:38 +00:00
|
|
|
if (flags & RENAME_EXCHANGE)
|
|
|
|
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
|
|
|
|
new_dentry);
|
|
|
|
|
|
|
|
return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
|
2014-07-23 13:15:32 +00:00
|
|
|
}
|
|
|
|
|
2018-04-24 14:23:59 +00:00
|
|
|
struct btrfs_delalloc_work {
|
|
|
|
struct inode *inode;
|
|
|
|
struct completion completion;
|
|
|
|
struct list_head list;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
2012-10-25 09:28:04 +00:00
|
|
|
static void btrfs_run_delalloc_work(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct btrfs_delalloc_work *delalloc_work;
|
2013-10-28 19:03:41 +00:00
|
|
|
struct inode *inode;
|
2012-10-25 09:28:04 +00:00
|
|
|
|
|
|
|
delalloc_work = container_of(work, struct btrfs_delalloc_work,
|
|
|
|
work);
|
2013-10-28 19:03:41 +00:00
|
|
|
inode = delalloc_work->inode;
|
2015-11-27 18:27:11 +00:00
|
|
|
filemap_flush(inode->i_mapping);
|
|
|
|
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2013-10-28 19:03:41 +00:00
|
|
|
filemap_flush(inode->i_mapping);
|
2012-10-25 09:28:04 +00:00
|
|
|
|
2018-04-23 07:54:16 +00:00
|
|
|
iput(inode);
|
2012-10-25 09:28:04 +00:00
|
|
|
complete(&delalloc_work->completion);
|
|
|
|
}
|
|
|
|
|
2018-04-24 14:23:59 +00:00
|
|
|
static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
|
2012-10-25 09:28:04 +00:00
|
|
|
{
|
|
|
|
struct btrfs_delalloc_work *work;
|
|
|
|
|
2015-12-08 13:39:32 +00:00
|
|
|
work = kmalloc(sizeof(*work), GFP_NOFS);
|
2012-10-25 09:28:04 +00:00
|
|
|
if (!work)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
init_completion(&work->completion);
|
|
|
|
INIT_LIST_HEAD(&work->list);
|
|
|
|
work->inode = inode;
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
|
|
|
WARN_ON_ONCE(!inode);
|
|
|
|
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
|
|
|
|
btrfs_run_delalloc_work, NULL, NULL);
|
2012-10-25 09:28:04 +00:00
|
|
|
|
|
|
|
return work;
|
|
|
|
}
|
|
|
|
|
2008-09-29 19:18:18 +00:00
|
|
|
/*
|
|
|
|
* some fairly slow code that needs optimization. This walks the list
|
|
|
|
* of all the inodes with pending delalloc and forces them to disk.
|
|
|
|
*/
|
2018-04-23 07:54:15 +00:00
|
|
|
static int start_delalloc_inodes(struct btrfs_root *root, int nr)
|
2008-08-05 03:17:27 +00:00
|
|
|
{
|
|
|
|
struct btrfs_inode *binode;
|
2008-09-26 14:05:38 +00:00
|
|
|
struct inode *inode;
|
2012-10-25 09:28:04 +00:00
|
|
|
struct btrfs_delalloc_work *work, *next;
|
|
|
|
struct list_head works;
|
2013-01-22 10:49:00 +00:00
|
|
|
struct list_head splice;
|
2012-10-25 09:28:04 +00:00
|
|
|
int ret = 0;
|
2008-08-05 03:17:27 +00:00
|
|
|
|
2012-10-25 09:28:04 +00:00
|
|
|
INIT_LIST_HEAD(&works);
|
2013-01-22 10:49:00 +00:00
|
|
|
INIT_LIST_HEAD(&splice);
|
2013-01-22 10:50:35 +00:00
|
|
|
|
2014-03-06 05:55:03 +00:00
|
|
|
mutex_lock(&root->delalloc_mutex);
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
list_splice_init(&root->delalloc_inodes, &splice);
|
2013-01-22 10:49:00 +00:00
|
|
|
while (!list_empty(&splice)) {
|
|
|
|
binode = list_entry(splice.next, struct btrfs_inode,
|
2008-08-05 03:17:27 +00:00
|
|
|
delalloc_inodes);
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2013-05-15 07:48:22 +00:00
|
|
|
list_move_tail(&binode->delalloc_inodes,
|
|
|
|
&root->delalloc_inodes);
|
2008-09-26 14:05:38 +00:00
|
|
|
inode = igrab(&binode->vfs_inode);
|
2013-01-29 10:11:59 +00:00
|
|
|
if (!inode) {
|
2013-05-15 07:48:22 +00:00
|
|
|
cond_resched_lock(&root->delalloc_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
continue;
|
2013-01-29 10:11:59 +00:00
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2018-04-23 07:54:16 +00:00
|
|
|
work = btrfs_alloc_delalloc_work(inode);
|
2014-09-29 17:20:37 +00:00
|
|
|
if (!work) {
|
2018-04-23 07:54:15 +00:00
|
|
|
iput(inode);
|
2013-01-22 10:49:00 +00:00
|
|
|
ret = -ENOMEM;
|
2014-04-02 11:53:32 +00:00
|
|
|
goto out;
|
2008-09-26 14:05:38 +00:00
|
|
|
}
|
2013-01-22 10:49:00 +00:00
|
|
|
list_add_tail(&work->list, &works);
|
2014-02-28 02:46:09 +00:00
|
|
|
btrfs_queue_work(root->fs_info->flush_workers,
|
|
|
|
&work->work);
|
2014-03-06 05:55:01 +00:00
|
|
|
ret++;
|
|
|
|
if (nr != -1 && ret >= nr)
|
2014-04-02 11:53:32 +00:00
|
|
|
goto out;
|
2008-09-26 14:05:38 +00:00
|
|
|
cond_resched();
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
2008-08-05 03:17:27 +00:00
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
2008-09-29 15:19:10 +00:00
|
|
|
|
2014-04-02 11:53:32 +00:00
|
|
|
out:
|
2013-05-15 07:48:22 +00:00
|
|
|
list_for_each_entry_safe(work, next, &works, list) {
|
|
|
|
list_del_init(&work->list);
|
2018-04-19 07:46:39 +00:00
|
|
|
wait_for_completion(&work->completion);
|
|
|
|
kfree(work);
|
2013-05-15 07:48:22 +00:00
|
|
|
}
|
|
|
|
|
2018-04-19 07:46:37 +00:00
|
|
|
if (!list_empty(&splice)) {
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
list_splice_tail(&splice, &root->delalloc_inodes);
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
}
|
2014-03-06 05:55:03 +00:00
|
|
|
mutex_unlock(&root->delalloc_mutex);
|
2013-05-15 07:48:22 +00:00
|
|
|
return ret;
|
|
|
|
}
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2018-04-23 07:54:14 +00:00
|
|
|
int btrfs_start_delalloc_inodes(struct btrfs_root *root)
|
2013-05-15 07:48:22 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2013-05-15 07:48:22 +00:00
|
|
|
int ret;
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
|
2013-05-15 07:48:22 +00:00
|
|
|
return -EROFS;
|
|
|
|
|
2018-04-23 07:54:15 +00:00
|
|
|
ret = start_delalloc_inodes(root, -1);
|
2014-03-06 05:55:01 +00:00
|
|
|
if (ret > 0)
|
|
|
|
ret = 0;
|
2013-05-15 07:48:22 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-04-23 07:54:13 +00:00
|
|
|
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
|
2013-05-15 07:48:22 +00:00
|
|
|
{
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct list_head splice;
|
|
|
|
int ret;
|
|
|
|
|
2014-01-14 11:42:20 +00:00
|
|
|
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
|
2013-05-15 07:48:22 +00:00
|
|
|
return -EROFS;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&splice);
|
|
|
|
|
2014-03-06 05:55:03 +00:00
|
|
|
mutex_lock(&fs_info->delalloc_root_mutex);
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
|
|
|
list_splice_init(&fs_info->delalloc_roots, &splice);
|
2014-03-06 05:55:01 +00:00
|
|
|
while (!list_empty(&splice) && nr) {
|
2013-05-15 07:48:22 +00:00
|
|
|
root = list_first_entry(&splice, struct btrfs_root,
|
|
|
|
delalloc_root);
|
|
|
|
root = btrfs_grab_fs_root(root);
|
|
|
|
BUG_ON(!root);
|
|
|
|
list_move_tail(&root->delalloc_root,
|
|
|
|
&fs_info->delalloc_roots);
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
|
|
|
|
2018-04-23 07:54:15 +00:00
|
|
|
ret = start_delalloc_inodes(root, nr);
|
2013-05-15 07:48:22 +00:00
|
|
|
btrfs_put_fs_root(root);
|
2014-03-06 05:55:01 +00:00
|
|
|
if (ret < 0)
|
2013-05-15 07:48:22 +00:00
|
|
|
goto out;
|
|
|
|
|
2014-03-06 05:55:01 +00:00
|
|
|
if (nr != -1) {
|
|
|
|
nr -= ret;
|
|
|
|
WARN_ON(nr < 0);
|
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2012-10-25 09:28:04 +00:00
|
|
|
}
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
|
2014-03-06 05:55:01 +00:00
|
|
|
ret = 0;
|
2013-05-15 07:48:22 +00:00
|
|
|
out:
|
2018-04-19 07:46:37 +00:00
|
|
|
if (!list_empty(&splice)) {
|
2013-05-15 07:48:22 +00:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
|
|
|
list_splice_tail(&splice, &fs_info->delalloc_roots);
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-01-22 10:49:00 +00:00
|
|
|
}
|
2014-03-06 05:55:03 +00:00
|
|
|
mutex_unlock(&fs_info->delalloc_root_mutex);
|
2012-10-25 09:28:04 +00:00
|
|
|
return ret;
|
2008-08-05 03:17:27 +00:00
|
|
|
}
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
|
|
|
|
const char *symname)
|
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2007-12-21 21:27:21 +00:00
|
|
|
struct inode *inode = NULL;
|
2007-06-12 10:35:45 +00:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
u64 objectid;
|
2013-10-31 05:03:04 +00:00
|
|
|
u64 index = 0;
|
2007-06-12 10:35:45 +00:00
|
|
|
int name_len;
|
|
|
|
int datasize;
|
2007-10-15 20:14:19 +00:00
|
|
|
unsigned long ptr;
|
2007-06-12 10:35:45 +00:00
|
|
|
struct btrfs_file_extent_item *ei;
|
2007-10-15 20:14:19 +00:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2013-09-16 08:53:28 +00:00
|
|
|
name_len = strlen(symname);
|
2016-06-22 22:54:23 +00:00
|
|
|
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
|
2007-06-12 10:35:45 +00:00
|
|
|
return -ENAMETOOLONG;
|
2007-12-21 21:27:21 +00:00
|
|
|
|
2009-09-11 20:12:44 +00:00
|
|
|
/*
|
|
|
|
* 2 items for inode item and ref
|
|
|
|
* 2 items for dir items
|
2015-12-31 18:16:29 +00:00
|
|
|
* 1 item for updating parent inode item
|
|
|
|
* 1 item for the inline extent item
|
2009-09-11 20:12:44 +00:00
|
|
|
* 1 item for xattr if selinux is on
|
|
|
|
*/
|
2015-12-31 18:16:29 +00:00
|
|
|
trans = btrfs_start_transaction(root, 7);
|
2010-05-16 14:48:46 +00:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-12-21 21:27:21 +00:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 02:06:11 +00:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-24 16:12:38 +00:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2017-01-20 13:54:07 +00:00
|
|
|
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
|
|
|
|
objectid, S_IFLNK|S_IRWXUGO, &index);
|
2011-04-25 23:43:53 +00:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
goto out_unlock;
|
2011-04-25 23:43:53 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2011-12-15 15:09:07 +00:00
|
|
|
/*
|
|
|
|
* If the active LSM wants to access the inode during
|
|
|
|
* d_instantiate it needs these. Smack checks to see
|
|
|
|
* if the filesystem supports xattrs by looking at the
|
|
|
|
* ops vector.
|
|
|
|
*/
|
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
2014-09-08 20:08:51 +00:00
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
|
|
|
|
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock_inode;
|
2011-12-15 15:09:07 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 17:38:47 +00:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
2014-09-08 20:08:51 +00:00
|
|
|
goto out_unlock_inode;
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 17:38:47 +00:00
|
|
|
}
|
2017-01-10 18:35:31 +00:00
|
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
2007-06-12 10:35:45 +00:00
|
|
|
key.offset = 0;
|
2014-06-04 16:41:45 +00:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
2007-06-12 10:35:45 +00:00
|
|
|
datasize = btrfs_file_extent_calc_inline_size(name_len);
|
|
|
|
err = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
datasize);
|
2007-06-22 18:16:25 +00:00
|
|
|
if (err) {
|
2011-05-14 07:10:51 +00:00
|
|
|
btrfs_free_path(path);
|
2014-09-08 20:08:51 +00:00
|
|
|
goto out_unlock_inode;
|
2007-06-22 18:16:25 +00:00
|
|
|
}
|
2007-10-15 20:14:19 +00:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, ei,
|
2007-06-12 10:35:45 +00:00
|
|
|
BTRFS_FILE_EXTENT_INLINE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
|
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
2007-10-15 20:14:19 +00:00
|
|
|
write_extent_buffer(leaf, symname, ptr, name_len);
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 10:35:45 +00:00
|
|
|
btrfs_free_path(path);
|
2007-10-15 20:14:19 +00:00
|
|
|
|
2007-06-12 10:35:45 +00:00
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
2015-11-17 06:07:57 +00:00
|
|
|
inode_nohighmem(inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
inode->i_mapping->a_ops = &btrfs_symlink_aops;
|
2008-10-30 18:25:28 +00:00
|
|
|
inode_set_bytes(inode, name_len);
|
2017-02-20 11:50:34 +00:00
|
|
|
btrfs_i_size_write(BTRFS_I(inode), name_len);
|
2007-06-22 18:16:25 +00:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
2015-12-31 18:08:24 +00:00
|
|
|
/*
|
|
|
|
* Last step, add directory indexes for our symlink inode. This is the
|
|
|
|
* last step to avoid extra cleanup of these indexes if an error happens
|
|
|
|
* elsewhere above.
|
|
|
|
*/
|
|
|
|
if (!err)
|
2017-02-20 11:51:09 +00:00
|
|
|
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
|
|
|
|
BTRFS_I(inode), 0, index);
|
2014-09-08 20:08:51 +00:00
|
|
|
if (err) {
|
2007-06-22 18:16:25 +00:00
|
|
|
drop_inode = 1;
|
2014-09-08 20:08:51 +00:00
|
|
|
goto out_unlock_inode;
|
|
|
|
}
|
|
|
|
|
2018-05-04 12:23:01 +00:00
|
|
|
d_instantiate_new(dentry, inode);
|
2007-06-12 10:35:45 +00:00
|
|
|
|
|
|
|
out_unlock:
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2007-06-12 10:35:45 +00:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_btree_balance_dirty(fs_info);
|
2007-06-12 10:35:45 +00:00
|
|
|
return err;
|
2014-09-08 20:08:51 +00:00
|
|
|
|
|
|
|
out_unlock_inode:
|
|
|
|
drop_inode = 1;
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_unlock;
|
2007-06-12 10:35:45 +00:00
|
|
|
}
|
2008-04-10 14:23:21 +00:00
|
|
|
|
2010-06-21 18:48:16 +00:00
|
|
|
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint,
|
|
|
|
struct btrfs_trans_handle *trans)
|
2008-10-30 18:25:28 +00:00
|
|
|
{
|
2016-06-22 22:54:23 +00:00
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct extent_map *em;
|
2008-10-30 18:25:28 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
u64 cur_offset = start;
|
2010-11-22 18:50:32 +00:00
|
|
|
u64 i_size;
|
2013-03-05 16:11:26 +00:00
|
|
|
u64 cur_bytes;
|
2015-09-23 21:11:16 +00:00
|
|
|
u64 last_alloc = (u64)-1;
|
2008-10-30 18:25:28 +00:00
|
|
|
int ret = 0;
|
2010-06-21 18:48:16 +00:00
|
|
|
bool own_trans = true;
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
u64 end = start + num_bytes - 1;
|
2008-10-30 18:25:28 +00:00
|
|
|
|
2010-06-21 18:48:16 +00:00
|
|
|
if (trans)
|
|
|
|
own_trans = false;
|
2008-10-30 18:25:28 +00:00
|
|
|
while (num_bytes > 0) {
|
2010-06-21 18:48:16 +00:00
|
|
|
if (own_trans) {
|
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
break;
|
|
|
|
}
|
2009-11-12 09:34:52 +00:00
|
|
|
}
|
|
|
|
|
2015-12-14 16:42:10 +00:00
|
|
|
cur_bytes = min_t(u64, num_bytes, SZ_256M);
|
2013-03-05 16:11:26 +00:00
|
|
|
cur_bytes = max(cur_bytes, min_size);
|
2015-09-23 21:11:16 +00:00
|
|
|
/*
|
|
|
|
* If we are severely fragmented we could end up with really
|
|
|
|
* small allocations, so if the allocator is returning small
|
|
|
|
* chunks lets make its job easier by only searching for those
|
|
|
|
* sized chunks.
|
|
|
|
*/
|
|
|
|
cur_bytes = min(cur_bytes, last_alloc);
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
|
|
|
|
min_size, 0, *alloc_hint, &ins, 1, 0);
|
2009-11-12 09:34:52 +00:00
|
|
|
if (ret) {
|
2010-06-21 18:48:16 +00:00
|
|
|
if (own_trans)
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2010-05-16 14:48:46 +00:00
|
|
|
break;
|
2008-10-30 18:25:28 +00:00
|
|
|
}
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
2009-11-12 09:34:52 +00:00
|
|
|
|
2015-09-23 21:11:16 +00:00
|
|
|
last_alloc = ins.offset;
|
2008-10-30 18:25:28 +00:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
cur_offset, ins.objectid,
|
|
|
|
ins.offset, ins.offset,
|
2009-11-12 09:34:08 +00:00
|
|
|
ins.offset, 0, 0, 0,
|
2008-10-30 18:25:28 +00:00
|
|
|
BTRFS_FILE_EXTENT_PREALLOC);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (ret) {
|
2016-06-22 22:54:24 +00:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:50 +00:00
|
|
|
ins.offset, 0);
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (own_trans)
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2012-03-12 15:03:00 +00:00
|
|
|
break;
|
|
|
|
}
|
2014-12-12 08:44:35 +00:00
|
|
|
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
|
2009-09-11 16:27:37 +00:00
|
|
|
cur_offset + ins.offset -1, 0);
|
2009-11-12 09:34:52 +00:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
em = alloc_extent_map();
|
|
|
|
if (!em) {
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
em->start = cur_offset;
|
|
|
|
em->orig_start = cur_offset;
|
|
|
|
em->len = ins.offset;
|
|
|
|
em->block_start = ins.objectid;
|
|
|
|
em->block_len = ins.offset;
|
2012-12-03 15:31:19 +00:00
|
|
|
em->orig_block_len = ins.offset;
|
2013-04-04 18:31:27 +00:00
|
|
|
em->ram_bytes = ins.offset;
|
2016-06-22 22:54:23 +00:00
|
|
|
em->bdev = fs_info->fs_devices->latest_bdev;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
|
|
|
|
em->generation = trans->transid;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-05 20:51:15 +00:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
if (ret != -EEXIST)
|
|
|
|
break;
|
2017-02-20 11:50:45 +00:00
|
|
|
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 17:14:17 +00:00
|
|
|
cur_offset + ins.offset - 1,
|
|
|
|
0);
|
|
|
|
}
|
|
|
|
free_extent_map(em);
|
|
|
|
next:
|
2008-10-30 18:25:28 +00:00
|
|
|
num_bytes -= ins.offset;
|
|
|
|
cur_offset += ins.offset;
|
2010-05-16 14:49:59 +00:00
|
|
|
*alloc_hint = ins.objectid + ins.offset;
|
2009-11-12 09:34:52 +00:00
|
|
|
|
2012-04-05 19:03:02 +00:00
|
|
|
inode_inc_iversion(inode);
|
2016-09-14 14:48:06 +00:00
|
|
|
inode->i_ctime = current_time(inode);
|
2009-04-17 08:37:41 +00:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
|
2008-10-30 18:25:28 +00:00
|
|
|
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
2010-05-16 14:49:59 +00:00
|
|
|
(actual_len > inode->i_size) &&
|
|
|
|
(cur_offset > inode->i_size)) {
|
2010-01-20 07:28:54 +00:00
|
|
|
if (cur_offset > actual_len)
|
2010-11-22 18:50:32 +00:00
|
|
|
i_size = actual_len;
|
2010-01-20 07:28:54 +00:00
|
|
|
else
|
2010-11-22 18:50:32 +00:00
|
|
|
i_size = cur_offset;
|
|
|
|
i_size_write(inode, i_size);
|
|
|
|
btrfs_ordered_update_i_size(inode, i_size, NULL);
|
2009-11-12 09:34:52 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 18:25:28 +00:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2012-03-12 15:03:00 +00:00
|
|
|
|
|
|
|
if (ret) {
|
2016-06-10 22:19:25 +00:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 15:03:00 +00:00
|
|
|
if (own_trans)
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2012-03-12 15:03:00 +00:00
|
|
|
break;
|
|
|
|
}
|
2008-10-30 18:25:28 +00:00
|
|
|
|
2010-06-21 18:48:16 +00:00
|
|
|
if (own_trans)
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_end_transaction(trans);
|
2009-11-12 09:34:52 +00:00
|
|
|
}
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
if (cur_offset < end)
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 07:10:39 +00:00
|
|
|
btrfs_free_reserved_data_space(inode, NULL, cur_offset,
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allocate a new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's because bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both successful and failed
paths, so btrfs_prealloc_file_range()'s callers do not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 07:51:40 +00:00
|
|
|
end - cur_offset + 1);
|
2008-10-30 18:25:28 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-06-21 18:48:16 +00:00
|
|
|
int btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
{
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
min_size, actual_len, alloc_hint,
|
|
|
|
NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_prealloc_file_range_trans(struct inode *inode,
|
|
|
|
struct btrfs_trans_handle *trans, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
{
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
min_size, actual_len, alloc_hint, trans);
|
|
|
|
}
|
|
|
|
|
2008-07-17 16:53:50 +00:00
|
|
|
/*
 * set_page_dirty address-space callback: delegates directly to
 * __set_page_dirty_nobuffers() and returns its result.
 */
static int btrfs_set_page_dirty(struct page *page)
{
	int dirtied = __set_page_dirty_nobuffers(page);

	return dirtied;
}
|
|
|
|
|
2011-06-20 23:28:19 +00:00
|
|
|
static int btrfs_permission(struct inode *inode, int mask)
|
2008-01-14 18:26:08 +00:00
|
|
|
{
|
2010-12-20 08:04:08 +00:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-08-15 17:27:21 +00:00
|
|
|
umode_t mode = inode->i_mode;
|
2010-12-20 08:04:08 +00:00
|
|
|
|
2011-08-15 17:27:21 +00:00
|
|
|
if (mask & MAY_WRITE &&
|
|
|
|
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
|
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
|
|
|
|
return -EACCES;
|
|
|
|
}
|
2011-06-20 23:16:29 +00:00
|
|
|
return generic_permission(inode, mask);
|
2008-01-14 18:26:08 +00:00
|
|
|
}
|
2007-06-12 10:35:45 +00:00
|
|
|
|
2014-04-27 19:40:45 +00:00
|
|
|
/*
 * Create an unnamed temporary file in @dir and attach it to @dentry.
 *
 * Starts a transaction, looks up a free inode number and creates a new
 * inode wired to btrfs_file_operations, btrfs_file_inode_operations,
 * btrfs_aops and btrfs_extent_io_ops.  The inode then gets its security
 * attributes, is written with btrfs_update_inode() and registered with
 * btrfs_orphan_add() before being handed to d_tmpfile().
 *
 * Returns 0 on success, otherwise the error of the first step that failed.
 * The transaction is always ended and btrfs_btree_balance_dirty() is always
 * called once the transaction was started successfully.
 */
static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_trans_handle *trans;
	struct inode *inode = NULL;
	u64 objectid;
	u64 index;
	int ret = 0;

	/*
	 * 5 units required for adding orphan entry
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		goto out;

	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
				btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		/* Keep the iput() at the exit label from seeing an ERR_PTR. */
		inode = NULL;
		goto out;
	}

	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	/* Run the three setup steps in order, stopping at the first error. */
	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
	if (!ret)
		ret = btrfs_update_inode(trans, root, inode);
	if (!ret)
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		/* The new inode is still locked on every one of these paths. */
		unlock_new_inode(inode);
		goto out;
	}

	/*
	 * We set number of links to 0 in btrfs_new_inode(), and here we set
	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
	 * through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);
	unlock_new_inode(inode);
	d_tmpfile(dentry, inode);
	mark_inode_dirty(inode);

out:
	btrfs_end_transaction(trans);
	if (ret)
		iput(inode);
	btrfs_btree_balance_dirty(fs_info);
	return ret;
}
|
|
|
|
|
2017-02-17 15:24:29 +00:00
|
|
|
/*
 * readpage_io_failed_hook callback for data inodes.
 *
 * Ignores both arguments and unconditionally returns -EAGAIN.  The function
 * is marked const because its result does not depend on any state.
 */
__attribute__((const))
static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
{
	const int ret = -EAGAIN;

	return ret;
}
|
|
|
|
|
2017-05-05 15:57:13 +00:00
|
|
|
/*
 * Debug check on the end offset of an extent io range.
 *
 * @private_data is the inode the range belongs to and @caller is a label
 * printed in the message.  A message is emitted through btrfs_debug_rl()
 * only when @end is at least PAGE_SIZE, is an even number, and is not the
 * last byte of the file (isize - 1).  Nothing else is done with the range.
 */
static void btrfs_check_extent_io_range(void *private_data, const char *caller,
					u64 start, u64 end)
{
	struct inode *inode = private_data;
	u64 isize = i_size_read(inode);

	if (end < PAGE_SIZE)
		return;
	/* An odd @end is what the check considers normal. */
	if ((end % 2) != 0)
		return;
	if (end == isize - 1)
		return;

	btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		"%s: ino %llu isize %llu odd range [%llu,%llu]",
		caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
}
|
|
|
|
|
2018-07-18 18:32:52 +00:00
|
|
|
/*
 * Mark every page covering the byte range [@start, @end] as under writeback.
 *
 * The inode is taken from @tree->private_data.  Each page index from
 * start >> PAGE_SHIFT through end >> PAGE_SHIFT (inclusive) is looked up
 * with find_get_page(), passed to set_page_writeback() and then released
 * with put_page().
 */
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	const unsigned long last_index = end >> PAGE_SHIFT;
	unsigned long cur;

	for (cur = start >> PAGE_SHIFT; cur <= last_index; cur++) {
		struct page *page = find_get_page(inode->i_mapping, cur);

		ASSERT(page); /* Pages should be in the extent_io_tree */
		set_page_writeback(page);
		put_page(page);
	}
}
|
|
|
|
|
2009-09-22 00:01:11 +00:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations = {
|
2008-11-18 01:42:26 +00:00
|
|
|
.getattr = btrfs_getattr,
|
2007-06-12 10:35:45 +00:00
|
|
|
.lookup = btrfs_lookup,
|
|
|
|
.create = btrfs_create,
|
|
|
|
.unlink = btrfs_unlink,
|
|
|
|
.link = btrfs_link,
|
|
|
|
.mkdir = btrfs_mkdir,
|
|
|
|
.rmdir = btrfs_rmdir,
|
2016-09-27 09:03:58 +00:00
|
|
|
.rename = btrfs_rename2,
|
2007-06-12 10:35:45 +00:00
|
|
|
.symlink = btrfs_symlink,
|
|
|
|
.setattr = btrfs_setattr,
|
2007-07-11 14:18:17 +00:00
|
|
|
.mknod = btrfs_mknod,
|
2007-11-16 16:45:54 +00:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-01-14 18:26:08 +00:00
|
|
|
.permission = btrfs_permission,
|
2011-07-23 15:37:31 +00:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 13:16:43 +00:00
|
|
|
.set_acl = btrfs_set_acl,
|
2013-09-16 17:42:03 +00:00
|
|
|
.update_time = btrfs_update_time,
|
2014-04-27 19:40:45 +00:00
|
|
|
.tmpfile = btrfs_tmpfile,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
2009-09-22 00:01:11 +00:00
|
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations = {
|
2007-06-12 10:35:45 +00:00
|
|
|
.lookup = btrfs_lookup,
|
2008-01-14 18:26:08 +00:00
|
|
|
.permission = btrfs_permission,
|
2013-09-16 17:42:03 +00:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2009-10-01 22:43:56 +00:00
|
|
|
static const struct file_operations btrfs_dir_file_operations = {
|
2007-06-12 10:35:45 +00:00
|
|
|
.llseek = generic_file_llseek,
|
|
|
|
.read = generic_read_dir,
|
2016-05-20 20:50:33 +00:00
|
|
|
.iterate_shared = btrfs_real_readdir,
|
2017-07-24 19:14:25 +00:00
|
|
|
.open = btrfs_opendir,
|
2007-09-14 14:22:47 +00:00
|
|
|
.unlocked_ioctl = btrfs_ioctl,
|
2007-06-12 10:35:45 +00:00
|
|
|
#ifdef CONFIG_COMPAT
|
2015-10-29 08:22:21 +00:00
|
|
|
.compat_ioctl = btrfs_compat_ioctl,
|
2007-06-12 10:35:45 +00:00
|
|
|
#endif
|
2008-06-10 14:07:39 +00:00
|
|
|
.release = btrfs_release_file,
|
2008-09-05 20:13:11 +00:00
|
|
|
.fsync = btrfs_sync_file,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
|
|
|
|
2015-11-19 10:42:28 +00:00
|
|
|
static const struct extent_io_ops btrfs_extent_io_ops = {
|
2017-02-17 14:27:44 +00:00
|
|
|
/* mandatory callbacks */
|
2008-02-20 17:07:25 +00:00
|
|
|
.submit_bio_hook = btrfs_submit_bio_hook,
|
2007-08-30 12:50:51 +00:00
|
|
|
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
|
2017-03-24 22:04:50 +00:00
|
|
|
.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
|
2017-02-17 14:27:44 +00:00
|
|
|
|
|
|
|
/* optional callbacks */
|
|
|
|
.fill_delalloc = run_delalloc_range,
|
2008-07-17 16:53:50 +00:00
|
|
|
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
|
2008-07-17 16:53:51 +00:00
|
|
|
.writepage_start_hook = btrfs_writepage_start_hook,
|
2008-01-31 16:05:37 +00:00
|
|
|
.set_bit_hook = btrfs_set_bit_hook,
|
|
|
|
.clear_bit_hook = btrfs_clear_bit_hook,
|
2009-09-11 20:12:44 +00:00
|
|
|
.merge_extent_hook = btrfs_merge_extent_hook,
|
|
|
|
.split_extent_hook = btrfs_split_extent_hook,
|
2017-05-05 15:57:13 +00:00
|
|
|
.check_extent_io_range = btrfs_check_extent_io_range,
|
2007-08-30 12:50:51 +00:00
|
|
|
};
|
|
|
|
|
2009-01-21 18:11:13 +00:00
|
|
|
/*
|
|
|
|
* btrfs doesn't support the bmap operation because swapfiles
|
|
|
|
* use bmap to make a mapping of extents in the file. They assume
|
|
|
|
* these extents won't change over the life of the file and they
|
|
|
|
* use the bmap result to do IO directly to the drive.
|
|
|
|
*
|
|
|
|
* the btrfs bmap call would return logical addresses that aren't
|
|
|
|
* suitable for IO and they also will change frequently as COW
|
|
|
|
* operations happen. So, swapfile + btrfs == corruption.
|
|
|
|
*
|
|
|
|
* For now we're avoiding this by dropping bmap.
|
|
|
|
*/
|
2009-09-22 00:01:10 +00:00
|
|
|
static const struct address_space_operations btrfs_aops = {
|
2007-06-12 10:35:45 +00:00
|
|
|
.readpage = btrfs_readpage,
|
|
|
|
.writepage = btrfs_writepage,
|
2007-11-01 23:45:34 +00:00
|
|
|
.writepages = btrfs_writepages,
|
2007-11-08 15:59:22 +00:00
|
|
|
.readpages = btrfs_readpages,
|
2008-04-10 14:23:21 +00:00
|
|
|
.direct_IO = btrfs_direct_IO,
|
2007-08-27 20:49:44 +00:00
|
|
|
.invalidatepage = btrfs_invalidatepage,
|
|
|
|
.releasepage = btrfs_releasepage,
|
2008-07-17 16:53:50 +00:00
|
|
|
.set_page_dirty = btrfs_set_page_dirty,
|
2009-09-16 09:50:18 +00:00
|
|
|
.error_remove_page = generic_error_remove_page,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
|
|
|
|
2009-09-22 00:01:10 +00:00
|
|
|
static const struct address_space_operations btrfs_symlink_aops = {
|
2007-06-12 10:35:45 +00:00
|
|
|
.readpage = btrfs_readpage,
|
|
|
|
.writepage = btrfs_writepage,
|
2007-08-30 15:54:02 +00:00
|
|
|
.invalidatepage = btrfs_invalidatepage,
|
|
|
|
.releasepage = btrfs_releasepage,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
|
|
|
|
2009-09-22 00:01:11 +00:00
|
|
|
static const struct inode_operations btrfs_file_inode_operations = {
|
2007-06-12 10:35:45 +00:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
.setattr = btrfs_setattr,
|
2007-11-16 16:45:54 +00:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-01-14 18:26:08 +00:00
|
|
|
.permission = btrfs_permission,
|
2009-01-21 19:39:14 +00:00
|
|
|
.fiemap = btrfs_fiemap,
|
2011-07-23 15:37:31 +00:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 13:16:43 +00:00
|
|
|
.set_acl = btrfs_set_acl,
|
2012-03-26 13:46:47 +00:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
2009-09-22 00:01:11 +00:00
|
|
|
static const struct inode_operations btrfs_special_inode_operations = {
|
2007-07-11 14:18:17 +00:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
.setattr = btrfs_setattr,
|
2008-01-14 18:26:08 +00:00
|
|
|
.permission = btrfs_permission,
|
2008-07-24 16:16:36 +00:00
|
|
|
.listxattr = btrfs_listxattr,
|
2011-07-23 15:37:31 +00:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 13:16:43 +00:00
|
|
|
.set_acl = btrfs_set_acl,
|
2012-03-26 13:46:47 +00:00
|
|
|
.update_time = btrfs_update_time,
|
2007-07-11 14:18:17 +00:00
|
|
|
};
|
2009-09-22 00:01:11 +00:00
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations = {
|
2015-11-17 15:20:54 +00:00
|
|
|
.get_link = page_get_link,
|
2010-11-19 02:05:24 +00:00
|
|
|
.getattr = btrfs_getattr,
|
2011-11-30 15:45:38 +00:00
|
|
|
.setattr = btrfs_setattr,
|
2008-01-14 18:26:08 +00:00
|
|
|
.permission = btrfs_permission,
|
2009-02-04 14:29:13 +00:00
|
|
|
.listxattr = btrfs_listxattr,
|
2012-03-26 13:46:47 +00:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 10:35:45 +00:00
|
|
|
};
|
2009-09-21 20:00:26 +00:00
|
|
|
|
2009-10-09 13:54:36 +00:00
|
|
|
const struct dentry_operations btrfs_dentry_operations = {
|
2009-09-21 20:00:26 +00:00
|
|
|
.d_delete = btrfs_dentry_delete,
|
|
|
|
};
|