linux/fs/btrfs/ordered-data.c
Chris Mason e6dcd2dc9c Btrfs: New data=ordered implementation
The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated an inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, The previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-25 11:04:04 -04:00

409 lines
10 KiB
C

/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
if (entry->file_offset + entry->len < entry->file_offset)
return (u64)-1;
return entry->file_offset + entry->len;
}
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
struct rb_node *node)
{
struct rb_node ** p = &root->rb_node;
struct rb_node * parent = NULL;
struct btrfs_ordered_extent *entry;
while(*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
if (file_offset < entry->file_offset)
p = &(*p)->rb_left;
else if (file_offset >= entry_end(entry))
p = &(*p)->rb_right;
else
return parent;
}
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
}
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
struct rb_node **prev_ret)
{
struct rb_node * n = root->rb_node;
struct rb_node *prev = NULL;
struct rb_node *test;
struct btrfs_ordered_extent *entry;
struct btrfs_ordered_extent *prev_entry = NULL;
while(n) {
entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
prev = n;
prev_entry = entry;
if (file_offset < entry->file_offset)
n = n->rb_left;
else if (file_offset >= entry_end(entry))
n = n->rb_right;
else
return n;
}
if (!prev_ret)
return NULL;
while(prev && file_offset >= entry_end(prev_entry)) {
test = rb_next(prev);
if (!test)
break;
prev_entry = rb_entry(test, struct btrfs_ordered_extent,
rb_node);
if (file_offset < entry_end(prev_entry))
break;
prev = test;
}
if (prev)
prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
rb_node);
while(prev && file_offset < entry_end(prev_entry)) {
test = rb_prev(prev);
if (!test)
break;
prev_entry = rb_entry(test, struct btrfs_ordered_extent,
rb_node);
prev = test;
}
*prev_ret = prev;
return NULL;
}
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
if (file_offset < entry->file_offset ||
entry->file_offset + entry->len <= file_offset)
return 0;
return 1;
}
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
u64 file_offset)
{
struct rb_root *root = &tree->tree;
struct rb_node *prev;
struct rb_node *ret;
struct btrfs_ordered_extent *entry;
if (tree->last) {
entry = rb_entry(tree->last, struct btrfs_ordered_extent,
rb_node);
if (offset_in_entry(entry, file_offset))
return tree->last;
}
ret = __tree_search(root, file_offset, &prev);
if (!ret)
ret = prev;
if (ret)
tree->last = ret;
return ret;
}
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
tree = &BTRFS_I(inode)->ordered_tree;
entry = kzalloc(sizeof(*entry), GFP_NOFS);
if (!entry)
return -ENOMEM;
mutex_lock(&tree->mutex);
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
entry->inode = inode;
/* one ref for the tree */
atomic_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node) {
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
}
set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
entry_end(entry) - 1, GFP_NOFS);
set_bit(BTRFS_ORDERED_START, &entry->flags);
mutex_unlock(&tree->mutex);
BUG_ON(node);
return 0;
}
int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
node = tree_search(tree, sum->file_offset);
if (!node) {
search_fail:
printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
node = rb_first(&tree->tree);
while(node) {
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
node = rb_next(node);
}
BUG();
}
BUG_ON(!node);
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!offset_in_entry(entry, sum->file_offset)) {
goto search_fail;
}
list_add_tail(&sum->list, &entry->list);
mutex_unlock(&tree->mutex);
return 0;
}
int btrfs_dec_test_ordered_pending(struct inode *inode,
u64 file_offset, u64 io_size)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
GFP_NOFS);
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
goto out;
}
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!offset_in_entry(entry, file_offset)) {
ret = 1;
goto out;
}
ret = test_range_bit(io_tree, entry->file_offset,
entry->file_offset + entry->len - 1,
EXTENT_ORDERED, 0);
if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
}
if (ret == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
out:
mutex_unlock(&tree->mutex);
return ret == 0;
}
int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
if (atomic_dec_and_test(&entry->refs))
kfree(entry);
return 0;
}
int btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
mutex_unlock(&tree->mutex);
wake_up(&entry->wait);
return 0;
}
void btrfs_wait_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry)
{
u64 start = entry->file_offset;
u64 end = start + entry->len - 1;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
#else
do_sync_mapping_range(inode->i_mapping, start, end,
SYNC_FILE_RANGE_WRITE);
#endif
wait_event(entry->wait,
test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}
static void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->len - 1;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
#else
do_sync_mapping_range(inode->i_mapping, start, end,
SYNC_FILE_RANGE_WRITE);
#endif
if (wait)
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
}
void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
u64 end;
struct btrfs_ordered_extent *ordered;
int found;
int should_wait = 0;
again:
if (start + len < start)
end = (u64)-1;
else
end = start + len - 1;
found = 0;
while(1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered) {
break;
}
if (ordered->file_offset >= start + len) {
btrfs_put_ordered_extent(ordered);
break;
}
if (ordered->file_offset + ordered->len < start) {
btrfs_put_ordered_extent(ordered);
break;
}
btrfs_start_ordered_extent(inode, ordered, should_wait);
found++;
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
if (end == 0)
break;
end--;
}
if (should_wait && found) {
should_wait = 0;
goto again;
}
}
int btrfs_add_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent *ordered,
u64 start, u64 len)
{
WARN_ON(1);
return 0;
#if 0
int ret;
struct btrfs_ordered_inode_tree *tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
ret = -EAGAIN;
goto out;
}
set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
ret = 0;
out:
mutex_unlock(&tree->mutex);
return ret;
#endif
}
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
node = tree_search(tree, file_offset);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!offset_in_entry(entry, file_offset))
entry = NULL;
if (entry)
atomic_inc(&entry->refs);
out:
mutex_unlock(&tree->mutex);
return entry;
}
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
node = tree_search(tree, file_offset);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
mutex_unlock(&tree->mutex);
return entry;
}